/* CSTLEMMA - trainable lemmatiser Copyright (C) 2002, 2005 Center for Sprogteknologi, University of Copenhagen This file is part of CSTLEMMA. CSTLEMMA is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. CSTLEMMA is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with CSTLEMMA; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "option.h" #if (defined PROGLEMMATISE || defined PROGMAKEDICT) #include "freqfile.h" #endif #include "caseconv.h" #include "argopt.h" #include #include #include #include #include #include #ifdef COUNTOBJECTS int optionStruct::COUNT = 0; #endif #if defined PROGLEMMATISE const char optionStruct::DefaultSep[] = "|"; //const char DefaultCFormat[] = "$w\\t[$b]1[$b0$B][$b>1$B]\\t$t\\n"; const char optionStruct::DefaultCFormat[] = "$w\\t$b1[[$b?]~1$B]\\t$t\\n"; const char optionStruct::DefaultCFormat_NoDict[] = "$w\\t$B\\t$t\\n"; //const char DefaultCFormat_NoTags[] = "$w\\t[$b]1[$b0$B][$b>1$B]\\n"; const char optionStruct::DefaultCFormat_NoTags[] = "$w\\t$b1[[$b?]~1$B]\\n"; const char optionStruct::DefaultCFormat_NoTags_NoDict[] = "$w\\t$B\\n"; const char optionStruct::DefaultCFormatXML[] = "$b1[[$b?]~1$B]"; const char optionStruct::DefaultCFormatXML_NoDict[] = "$B"; const char optionStruct::Default_b_format[] = "$w"; const char * optionStruct::Default_B_format = optionStruct::Default_b_format; #endif static char opts[] = "?@:A:b:B:c:C:d:De:f:FH:hi:I:k:l:Lm:n:N:o:p:q:R:s:t:u:U:v:W:x:X:y:z:" /* GNU: */ "wr"; /* static char ** poptions = NULL; static char * options = NULL; */ static char *** Ppoptions = NULL; static char ** Poptions = NULL; static int optionSets = 0; char * dupl(const char * s) { char * d = new char[strlen(s) + 1]; strcpy(d,s); return d; } optionStruct::optionStruct() { #if defined PROGLEMMATISE defaultbformat = true; defaultBformat = true; defaultCformat = true; dictfile = NULL; v = NULL; x = NULL; XML = false; ancestor = NULL; // if not null, restrict lemmatisation to elements that are offspring of ancestor element = NULL; // if null, analyse all PCDATA that is text wordAttribute = NULL; // if null, word is PCDATA POSAttribute = NULL; // if null, POS is PCDATA lemmaAttribute = NULL; // if null, Lemma is PCDATA lemmaClassAttribute = NULL; // if null, lemma class is PCDATA z = NULL; #endif #if (defined PROGMAKESUFFIXFLEX || defined PROGLEMMATISE) flx = NULL; #endif #if defined PROGLEMMATISE InputHasTags = true; keepPunctuation = 1; Sep = dupl(DefaultSep); #endif whattodo = LEMMATISE; argi = NULL; argo = NULL; arge = NULL; cformat = NULL;//dupl(DefaultCFormat); nice = false; #if defined PROGMAKEDICT CollapseHomographs = true; freq = NULL; #endif #if defined PROGLEMMATISE Wformat = NULL; bformat = NULL;//dupl(Default_b_format); Bformat = NULL;//dupl(Default_B_format); SortOutput = 0; RulesUnique = true; DictUnique = true; Iformat = NULL; UseLemmaFreqForDisambiguation = 0; baseformsAreLowercase = //false;/*20090731*/// true; size = ULONG_MAX; treatSlashAsAlternativesSeparator = false; #endif #ifdef COUNTOBJECTS ++COUNT; #endif #if defined PROGMAKESUFFIXFLEX showRefcount = false; CutoffRefcount = 0; #endif } optionStruct::~optionStruct() { for(int i = 0;i < optionSets;++i) { delete [] Poptions[i]; delete [] Ppoptions[i]; } delete [] Poptions; delete [] Ppoptions; delete [] cformat; #if defined PROGLEMMATISE delete [] bformat; delete [] Bformat; delete [] Wformat; delete [] Iformat; delete [] Sep; delete [] ancestor; delete [] element; delete [] wordAttribute; delete [] POSAttribute; delete [] lemmaAttribute; delete [] lemmaClassAttribute; #endif #ifdef COUNTOBJECTS --COUNT; #endif } OptReturnTp optionStruct::doSwitch(int c,char * locoptarg,char * progname) { switch (c) { case '@': readOptsFromFile(locoptarg,progname); break; #if defined PROGLEMMATISE case 'A': if(locoptarg && *locoptarg == '-') { treatSlashAsAlternativesSeparator = false; } else { treatSlashAsAlternativesSeparator = true; } break; case 'b': setbformat(locoptarg); // bformat = dupl(locoptarg); // defaultbformat = false; break; case 'B': setBformat(locoptarg); // Bformat = dupl(locoptarg); // defaultBformat = false; break; #endif case 'c': cformat = dupl(locoptarg); defaultCformat = false; break; #if defined PROGMAKESUFFIXFLEX case 'C': //CutoffRefcount = locoptarg == NULL || *locoptarg != '-'; if(!locoptarg || *locoptarg == '-') CutoffRefcount = 0; else CutoffRefcount = strtol(locoptarg,NULL,10); break; break; #endif #if defined PROGLEMMATISE case 'd': dictfile = locoptarg; break; #endif case 'D': whattodo = MAKEDICT; break; case 'e': arge = locoptarg; switch(*arge) { case '0': case '1': case '2': case '7': case '9': setEncoding(*arge - '0'); break; case 'u': case 'U': setEncoding(ENUNICODE); break; } break; #if (defined PROGMAKESUFFIXFLEX || defined PROGLEMMATISE) case 'f': flx = locoptarg; break; #endif case 'F': whattodo = MAKEFLEXPATTERNS; break; case 'h': case '?': printf("usage:\n"); printf("============================\n"); #if defined PROGMAKEDICT printf(" Create binary dictionary\n"); printf("%s -D \\\n",progname); printf(" -c [-N -n] [-y[-]] \\\n" " [-i] [-o]\n" " -c column format of dictionary (tab separated), e.g. -cBFT, which means:\n" " 1st column B(ase form), 2nd column F(ull form), 3rd column T(ype)\n" " -n column format of frequency file (tab separated)\n" " Example: -nN?FT, which means:\n" " 1st column N(frequency), 2nd column irrelevant,\n" " 3rd column F(ull form), 4th column T(ype)\n" " -y test output\n -y- release output (default)\n" " -k collapse homographs (remove \",n\" endings)(default)\n" " -k- do not collapse homographs (keep \",n\" endings)\n"); // printf("--More--");getchar(); printf("===============================\n"); #endif #if defined PROGMAKESUFFIXFLEX printf(" Create or add flex patterns\n"); printf("%s -F \\\n",progname); printf(" -c [-y[-]] [-i] \\\n" " [-f] [-o]\n" " -c column format, e.g. -cBFT, which means:\n" " 1st column B(aseform), 2nd column F(ullform), 3rd column T(ype)\n" " For lemmatising untagged text, suppress lexical type information by\n" " specifying '?' for the column containing the type.\n" " -y test output\n -y- release output (default)\n"); printf(" -R- Do not append refcount to base form (default)\n");// Bart 20050905 printf(" -R Append refcount to base form (format: [#])\n");// Bart 20050905 printf(" -C- Include all rules in output (default)\n");// Bart 20050905 printf(" -C Do not include rules with refcount <= \n");// Bart 20050905 // printf("--More--");getchar(); printf("=============\n"); #endif #if defined PROGLEMMATISE printf(" Lemmatise\n"); // printf("%s [-L] -c -b -B [-s[]] [-u[-]] -d -f [-z] [-i] [-o] [-m] [-n] [-x]\n",argv[0]); printf("%s [-L] \\\n",progname); printf(" -f [-d] [-u[-]] [-v[-]] \\\n" " [-I] [-i] [-o] \\\n" " [-c] [-b] [-B] [-W] [-s[]] \\\n" " [-x] [-v] \\\n" " [-z] [-@