/*
CSTLEMMA - trainable lemmatiser using word-end inflectional rules

Copyright (C) 2002, 2004  Center for Sprogteknologi, University of Copenhagen

This file is part of CSTLEMMA.

CSTLEMMA is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

CSTLEMMA is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with CSTLEMMA; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
#define CSTLEMMAVERSION "2.0"
#define CSTLEMMADATE "2004.01.07"
#define CSTLEMMACOPYRIGHT "2002-2004 Center for Sprogteknologi"
#include "argopt.h"
#include "lemmtags.h"
#include "freqfile.h"
#include "tags.h"
#include "flex.h"
#include "dictionary.h"
#include "makedict.h"
#include "text.h"
#include "basefrm.h"
#include <ctype.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>

// Outcome of handling one option (or a whole set of options). The numeric
// values are ordered by severity: callers keep the largest value seen.
//   GoOn  - option accepted, continue
//   Leave - informational text (help, licence) was printed; the program should stop
//   Error - an option had an invalid value
typedef enum {GoOn = 0,Leave = 1,Error = 2} OptReturnTp;
// The task the program has to perform, selected with -D, -F or -L.
typedef enum {MAKEDICT,MAKEFLEXPATTERNS,LEMMATISE} whattodoTp;
// Default separator between multiple base forms (see the -s option).
const char DefaultSep[] = "|";
// Earlier version of the default -c format, kept for reference:
//const char DefaultCFormat[] = "$w\\t[$b]1[$b0$B][$b>1$B]\\t$t\\n";
// Default -c output format: full form, base form, lexical type, separated by
// tabs. The dictionary's base form is taken if it offers one solution,
// otherwise the base form(s) predicted by the flex rules.
const char DefaultCFormat[] = "$w\\t$b1[[$b?]~1$B]\\t$t\\n";
// Earlier version of the tag-less default format, kept for reference:
//const char DefaultCFormat_NoTags[] = "$w\\t[$b]1[$b0$B][$b>1$B]\\n";
// Same as DefaultCFormat but without the $t column.
// NOTE(review): presumably used when the input is untagged (-t-); the use is
// not visible in this part of the file.
const char DefaultCFormat_NoTags[] = "$w\\t$b1[[$b?]~1$B]\\n";
// Default -b format (dictionary base forms): just the base form itself.
const char Default_b_format[] = "$w";
// Default -B format (rule-predicted base forms): shares the -b default.
const char * Default_B_format = Default_b_format;


// When false, info() prints nothing.
static bool DoInfo = true;

// printf-style progress/diagnostic output on standard output.
// Returns the value returned by vprintf, or 0 if output is switched off
// by means of DoInfo.
static int info(const char *fmt, ...)
    {
    if(!DoInfo)
        return 0;

    va_list args;
    va_start(args,fmt);
    const int written = vprintf(fmt,args);
    va_end(args);
    return written;
    }


struct optionStruct
    {
    bool nice;
    bool SortOutput;
    bool RulesUnique;
    bool DictUnique;
    bool InputHasTags;
    bool CollapseHomographs;
    bool baseformsAreLowercase;
    bool treatSlashAsAlternativeSeparator;
    char * dictfile;
    char * v;
    char * x;
    char * z;
    char * flx;
    char * Iformat;
    char * Wformat;
    char * argi;
    char * argo;
    const char * Sep;
    const char * cformat;
    const char * bformat;
    const char * Bformat;
    FreqFile * freq;
    int keepPunctuation;
    int listLemmas;
    int UseLemmaFreqForDisambiguation;
    unsigned long int size;
    whattodoTp whattodo;
    optionStruct()
        {
        dictfile = NULL;
        v = NULL;
        x = NULL;
        z = NULL;
        flx = NULL;
        InputHasTags = true;
        CollapseHomographs = true;
        keepPunctuation = 1;
        Sep = DefaultSep;
        whattodo = LEMMATISE;
        argi = NULL;
        argo = NULL;
        cformat = DefaultCFormat;
        Wformat = NULL;
        bformat = Default_b_format;
        Bformat = Default_B_format;
        freq = NULL;
        nice = false;
        SortOutput = true;
        RulesUnique = true;
        DictUnique = true;
        listLemmas = 0;
        Iformat = NULL;
        UseLemmaFreqForDisambiguation = 0;
        baseformsAreLowercase = true;
        size = ULONG_MAX;
        treatSlashAsAlternativeSeparator = false;
        }
    };


// Global instance of the `flex` type.
// NOTE(review): presumably the set of flex pattern rules declared in flex.h;
// confirm against that header.
flex Flex;




// Quote character shown around example arguments in the usage text:
// double quote on Windows, single quote elsewhere.
#if defined _WIN32
#define commandlineQuote "\""
#else
#define commandlineQuote "\'"
#endif

// Forward declaration: doSwitch (option -@) and readOptsFromFile call each other.
static OptReturnTp readOptsFromFile(char * locoptarg,char * progname,optionStruct & Option);


static OptReturnTp doSwitch(int c,char * locoptarg,char * progname,optionStruct & Option)
    {
    switch (c)
        {
// GNU >>
        case 'w':
            printf("11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY\n");
            printf("FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN\n");
            printf("OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES\n");
            printf("PROVIDE THE PROGRAM \"AS IS\" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED\n");
            printf("OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF\n");
            printf("MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS\n");
            printf("TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE\n");
            printf("PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,\n");
            printf("REPAIR OR CORRECTION.\n");
            return Leave;
        case 'r':
            printf("12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING\n");
            printf("WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR\n");
            printf("REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,\n");
            printf("INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING\n");
            printf("OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED\n");
            printf("TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY\n");
            printf("YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER\n");
            printf("PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE\n");
            printf("POSSIBILITY OF SUCH DAMAGES.\n");
            return Leave;
// << GNU
        case '@':
            readOptsFromFile(locoptarg,progname,Option);
            break;
        case 'A':
            if(locoptarg && *locoptarg == '-')
                {
                Option.treatSlashAsAlternativeSeparator = false;
                }
            else
                {
                Option.treatSlashAsAlternativeSeparator = true;
                }	    
            break;
        case 'D':
            Option.whattodo = MAKEDICT;
            break;
        case 'F':
            Option.whattodo = MAKEFLEXPATTERNS;
            break;
        case 'L':
            Option.whattodo = LEMMATISE; // default action
            break;
/*            case 'a':
            if(locoptarg && *locoptarg == '-')
                OutputHasFullForm = false;
            else
                OutputHasFullForm = true;
            break;*/
        case 'b':
            Option.bformat = locoptarg; 
            break;
        case 'B':
            Option.Bformat = locoptarg; 
            break;
        case 'c':
            Option.cformat = locoptarg;
            break;
        case 'W':
            Option.Wformat = locoptarg;
            break;
        case 'd':
            Option.dictfile = locoptarg;
            break;
        case 'f':
            Option.flx = locoptarg;
            break;
        case 'H':
            if(locoptarg)
                {
                Option.UseLemmaFreqForDisambiguation = *locoptarg - '0';
                if(Option.UseLemmaFreqForDisambiguation < 0 || Option.UseLemmaFreqForDisambiguation > 2)
                    {
                    printf("-H option: specify -H0, -H1 or -H2 (found -H%s)\n",locoptarg);
                    return Error;
                    }
                }
            else
                {   
                printf("-H option: specify -H0, -H1 or -H2\n");
                return Error;
                }
            break;
        case 'h':
        case '?':
            printf("usage:\n");
            printf("============================\n");
            printf("    Create binary dictionary\n");
            printf("%s -D \\\n",progname);
            printf("         -c<format> [-N<frequency file> -n<format>] [-y[-]] \\\n");
            printf("        [-i<lemmafile>] [-o<binarydictionary>]\n");
            printf("    -c  column format of dictionary (tab separated), e.g. -cBFT, which means:\n");
            printf("        1st column B(ase form), 2nd column F(ull form), 3rd column T(ype)\n");
            printf("    -n  column format of frequency file (tab separated)\n");
            printf("        Example: -nN?FT, which means:\n");
            printf("        1st column N(frequency), 2nd column irrelevant,\n");
            printf("        3rd column F(ull form), 4th column T(ype)\n");
            printf("    -y  test output\n    -y- release output (default)\n");
            printf("    -k  collapse homographs (remove \",n\" endings)(default)\n");
            printf("    -k- do not collapse homographs (keep \",n\" endings)\n");
//                printf("--More--");getchar();
            printf("===============================\n");
            printf("    Create or add flex patterns\n");
            printf("%s -F \\\n",progname);
            printf("         -c<format> [-y[-]] [-i<lemmafile>] \\\n");
            printf("        [-f<old flexpatterns>] [-o<new flexpatterns>]\n");
            printf("    -c  column format, e.g. -cBFT, which means:\n");
            printf("        1st column B(aseform), 2nd column F(ullform), 3rd column T(ype)\n");
            printf("        For lemmatising untagged text, suppress lexical type information by\n"
                   "        specifying '?' in place of 'T'\n");
            printf("    -y  test output\n    -y- release output (default)\n");
            printf("--More--");getchar();
            printf("=============\n");
            printf("    Lemmatise\n");
//                printf("%s [-L] -c<format> -b<format> -B<format> [-s[<sep>]] [-u[-]] -d<binarydictionary> -f<flexpatterns> [-z<type conversion table>] [-i<input text>] [-o<output text>] [-m<conflicts>] [-n<newlemmas>] [-x<Lexical type translation table>]\n",argv[0]);
            printf("%s [-L] \\\n",progname);
            printf("         -f<flex patterns> [-d<binary dictionary>] [-u[-]] [-v[-]] \\\n");
            printf("         [-I<input format>] [-i<input text>] [-o<output text>] \\\n");
            printf("         [-c<format>] [-b<format>] [-B<format>] [-W<format>] [-s[<sep>]] \\\n");
            printf("         [-x<Lexical type translation table>] [-v<tag friends file>] \\\n");
            printf("         [-z<type conversion table>] [-@<option file>]\n");
            printf("    -i<input text>\tIf -t- defined: any flat text. Otherwise: words must be\n"
                   "        followed by tags, separated by '/'. Default: standard input.\n");  
            printf("    -I<format>\tInput format (if not word/tag (-t) or word (-t-)).\n"); 
            printf("        $w word to be lemmatised\n"); 
            printf("        $t tag\n"); 
            printf("        $d dummy\n"); 
            printf("        \\t tab\n"); 
            printf("        \\n new line\n"); 
            printf("        \\s white space\n"); 
            printf("        \\S all except white space\n"); 
            printf("    -o<output text>\tOutput format dependent on -b, -B, -c and -W arguments.\n"
                   "        Default output: standard output\n");  
            printf("    -d<binarydictionary>\tDictionary as produced with the -D option set.\n");  
            printf("        If no dictionary is specified, only the flex patterns are used.\n");  
            printf("        Without dictionary, wrong tags in the input can not be corrected.\n");  
            printf("    -f<flexpatterns>\tFile with flex patterns. (see -F). Best results for\n"
                   "        untagged input are obtained if the rules are made without lexical type\n"
                   "        information. See -c option above.\n");  
            printf("    -b<format string>\tdefault:" commandlineQuote "%s" commandlineQuote "\n",Default_b_format);  
            printf("        Output format for data pertaining to the base form, according to the\n");
            printf("        dictionary:\n");
            printf("        $f sum of frequencies of the words $W having the base form $w (lemmafrequency).\n");
            /*
            printf("        $f base form type or token frequency.\n");
            printf("           (The frequency of the base form type is given if you have\n");
            printf("            (a) specified $f in the -c<format> argument, or\n");
            printf("            (b) specified a -W<format> argument, or\n");
            printf("            (c) specified a -H0 or -H1 argument.\n");
            printf("            Otherwise, base form token frequency is given.)\n");
            */
#if FREQ24
            printf("        $n frequency of the full form $w/$t in \"standard\" corpus.\n");
#endif
//                printf("        $p probability of this lexical type (%%) = 100x$n/sum($n).\n");
            printf("        $t lexical type\n");
            printf("        $w base form\n");
            printf("        $W full form(s)\n");
            printf("        \\$ dollar\n");
            printf("        \\[ [\n");
            printf("        \\] ]\n");
            printf("        Example: -b" commandlineQuote "$f $w/$t" commandlineQuote "\n");
            printf("    -B<format string>\tdefault:" commandlineQuote "%s" commandlineQuote "\n",Default_B_format);  
            printf("        Output format for data pertaining to the base form, as predicted by\n");
            printf("        flex pattern rules. See -b\n");
            printf("--More--");getchar();
            printf("    -W<format string>\tdefault: not present.\n");  
            printf("        Output format:\n");
            printf("        $w full form\n");
            printf("        $t lexical type(s) according to dictionary\n");
            printf("        $f full form type frequency\n");
            printf("        $i info:  -    full form not in dictionary\n");
            printf("                  +    full form in dictionary, but other type\n");
            printf("               (blank) full form in dictionary\n");
            printf("        \\t tab\n");
            printf("        $X?, [X]? Do not output X. (X can be tested, though).\n");
            printf("        [X]+  Output X only if X occurs at least once. (X is an expression\n");
            printf("              containing $b or $B)\n");
            printf("        [X]>n Output X only if X occurs more than n times.\n");
            printf("        [X]n  Output X only if X occurs exactly n times.\n");
            printf("        [X]<n Output X only if X occurs less than n times.\n");
            printf("        [X]   Output X if all nested conditions are met, or if X occurs\n");
            printf("              at least once. ([X] itself is always met!)\n");
            printf("        Example: -c" commandlineQuote "$w/$t" commandlineQuote "\n");
            printf("        Example: -c" commandlineQuote "[+$b?]>0[-$b0]$w\\n" commandlineQuote "\n");
            printf("          (Output +<word> if word is found in dictionary, otherwise -<word>)\n");
            printf("--More--");getchar();
            printf("    -c<format string>\tdefault:\t" commandlineQuote "%s" commandlineQuote "\n",DefaultCFormat);// word/lemma/tag lemma: if dictionary gives 1 solution, take dictionary, otherwise rules
            printf("        Output format:\n");
            printf("        $w full form\n");
            printf("        $b base form(s) according to dictionary.\n");
            printf("           (You also need to specify -b<format>)\n");
            printf("           (If the full form is found in the dictionary and tag=lexical type,\n");
            printf("            then only one base form is output.\n");
            printf("            Otherwise all base forms are output)\n");
            printf("        $B base form(s) according to flex pattern rules\n");
            printf("           (You also need to specify -B<format>)\n");
            printf("           (only if full form not in dictionary, or in dictionary,\n");
            printf("            but with other lexical type.)\n");
            printf("        $t lexical type(s) according to dictionary\n");
            printf("        $f full form frequency\n");
            printf("        $i info: indicates - full form not in dictionary\n");
            printf("                           + full form in dictionary, but other type\n");
            printf("                           * full form in dictionary\n");
            printf("        \\t tab\n");
            printf("        $X?, [X]? Do not output X. (X can be tested, though).\n");
            printf("        $b and $B are variables: they can occur any number of times,\n");
            printf("        including zero. This number can be tested in conditions:\n");
            printf("        $bn   Output $b only if $b occurs exactly n-times (n >= 0).\n");
            printf("        $Bn   Output $B only if $B occurs exactly n-times (n >= 0).\n");
            printf("        [X]+  Output X only if X occurs at least once. (X is an expression\n");
            printf("              containing $b or $B)\n");
            printf("        [X]>n Output X only if X occurs more than n times.\n");
            printf("        [X]n  Output X only if X occurs exactly n times.\n");
            printf("        [X]<n Output X only if X occurs less than n times.\n");
            printf("        [X]   Output X if all nested conditions are met, or if X occurs\n");
            printf("              at least once. ([X] itself is always met!)\n");
            printf("        Example: -c" commandlineQuote "$w\t/$t" commandlineQuote "\n");
            printf("        Example: -c" commandlineQuote "[+$b?]>0[-$b0]$w\\n" commandlineQuote "\n");
            printf("          (Output +<word> if word is found in dictionary, otherwise -<word>)\n");
            printf("--More--");getchar();
            printf("    -l  force lemma to all-lowercase (default)\n");
            printf("    -l- make case of lemma similar to full form's case\n");
            printf("    -p  keep punctuation (default)\n");
            printf("    -p- ignore punctuation (only together with -t- and no -W format)\n");
            printf("    -p+ treat punctuation as tokens (only together with -t- and no -W format)\n");
            printf("    -q  sort output\n");
            printf("    -q- do not sort output (default)\n");
            printf("    -s<sep> multiple base forms (-b -B) are <sep>-separated. Example: -s" commandlineQuote " | " commandlineQuote "\n");
            printf("    -s  multiple base forms (-b -B) are " commandlineQuote "%s" commandlineQuote "-separated (default)\n",DefaultSep);
            printf("    -t  input text is tagged (default)\n    -t- input text is not tagged\n");
            printf("    -U  enforce unique flex rules (default)\n");
            printf("    -U- allow ambiguous flex rules\n");
            printf("    -u  enforce unique dictionary look-up (default)\n");
            printf("    -u- allow ambiguous dictionary look-up\n");
            printf("    -Hn n = 0: use lemma frequencies for disambiguation (default)\n");
            printf("        n = 1: use lemma frequencies for disambiguation,\n");
            printf("               show candidates for pruning between << and >>\n");
            printf("        n = 2: do not use lemma frequencies for disambiguation.\n");
            printf("    -v<tag friends file>: Use this to coerce the nearest fit between input\n"
                   "        tag and the dictionary's lexical types if the dictionary has more than\n"
                   "        one readings of the input word and none of these has a lexical type\n"
                   "        that exactly agrees with the input tag. Format:\n"
                   "             {<dict type> {<space> <tag>}* <newline>}*\n"
                   "        The more to the left the tag is, the better the agreement with the\n"
                   "        dictionary'e lexical type\n");
            printf("    -x<Lexical type translation table>: Use this to handle tagged texts with\n"
                   "        tags that do not occur in the dictionary. Format:\n"
                   "             {<dict type> {<space> <tag>}* <newline>}*\n");
            printf("    -z<type conversion table>: Use this to change the meaning of $t in -b and\n"
                   "        -B formats. Without conversion table, $t is the lexical type of the\n"
                   "        full form. With conversion table, $t is the lexical type of the base\n"
                   "        form, as defined by the table. Format:\n"
                   "             {<base form type> <space> <full form type> <newline>}*\n");
            printf("    -m<size>: Max. number of words in input. Default: 0 (meaning: unlimited)\n");
            printf("    -A  Treat / as separator between alternative words.\n"); // Bart 20030108
            printf("    -A- Do not treat / as separator between alternative words (default)\n");// Bart 20030108
            return Leave;
        case 'i':
            Option.argi = locoptarg;
            break;
        case 'I':
            Option.Iformat = locoptarg; 
            break;
            /*
        case 'm': // file containing conflicts
            argm = locoptarg;
            break;
        case 'n': // file containing new words
            argn = locoptarg;
            break;
            */
        case 'k':
            if(locoptarg && *locoptarg == '-')
                {
                Option.CollapseHomographs = false;
                }
            else
                {
                Option.CollapseHomographs = true;
                }
            break;
        case 'l':
            if(locoptarg && *locoptarg == '-')
                {
                Option.baseformsAreLowercase = false;
                }
            else
                {
                Option.baseformsAreLowercase = true;
                }
            break;
            
        case 'm':
            if(locoptarg)
                {
                Option.size = strtoul(locoptarg,NULL,10);
                printf("siez %lu\n",Option.size);
                if(Option.size == 0)
                    Option.size = ULONG_MAX;
                printf("siez %lu\n",Option.size);
                }
            else
                Option.size = ULONG_MAX;
            break;
        case 'n':
//Bart 20021223            if(Option.freq)
                {
                if(!Option.freq)
                    {
                    Option.freq = new FreqFile();
                    }
                (Option.freq)->addFormat(locoptarg);
                }
            break;
        case 'N':
//Bart 20021223            if(Option.freq)
                {
                if(!Option.freq)
                    {
                    Option.freq = new FreqFile();
                    }
                (Option.freq)->addName(locoptarg);
                }
            break;
        case 'o':
            Option.argo = locoptarg;
            break;
        case 'p':
            if(locoptarg)
                {
                if(*locoptarg == '-')
                    {
                    Option.keepPunctuation = 0;
                    }
                else if(*locoptarg == '+')
                    {
                    Option.keepPunctuation = 2;
                    }
                else
                    {
                    printf("Invalid argument %s for -p option.\n",locoptarg);
                    return Error;
                    }
                }
            else
                {
                Option.keepPunctuation = 1;
                }
            break;
        case 'q':
            if(locoptarg && *locoptarg == '-')
                {
                Option.SortOutput = false;
                }
            else
                {
                Option.SortOutput = true;
                }
            break;
        case 's':
            if(locoptarg && *locoptarg)
                {
                for(char * p = locoptarg;*p;)
                    {
                    if(*p == '\\')
                        {
                        switch(*(p + 1))
                            {
                            case 't':
                                *p++ = '\t';
                                memmove(p,p+1,strlen(p));
                                break;
                            case 'n':
                                *p++ = '\n';
                                memmove(p,p+1,strlen(p));
                                break;
                            default:
                                *p = *(p+1);
                                ++p;
                                memmove(p,p+1,strlen(p));
                                break;
                            }
                        }
                    else
                        ++p;
                    }
                Option.Sep = locoptarg;
                }
            else
                Option.Sep = DefaultSep;
            break;
        case 't':
            if(locoptarg && *locoptarg == '-')
                Option.InputHasTags = false;
            else
                Option.InputHasTags = true;
            break;
        case 'U':
            if(locoptarg && *locoptarg == '-')
                Option.RulesUnique = false;
            else
                Option.RulesUnique = true;
            break;
        case 'u':
            if(locoptarg && *locoptarg == '-')
                Option.DictUnique = false;
            else
                Option.DictUnique = true;
            break;
        case 'v':
            Option.v = locoptarg;
            break;
        case 'x':
            Option.x = locoptarg;
            break;
        case 'y':
            if(locoptarg && *locoptarg == '-')
                Option.nice = false;
            else
                Option.nice = true;
            break;
        case 'z':
            Option.z = locoptarg;
            break;
        }
    return GoOn;
    }

// All valid option letters. A letter followed by ':' takes an argument;
// the letters D, F, L, h, ?, w and r do not.
static char opts[] = "?@:A:b:B:c:d:Df:FH:hi:I:k:l:Lm:n:N:o:p:q:s:t:u:U:v:W:x:y:z:" /* GNU: */ "wr";
/*
static char ** poptions = NULL;
static char * options = NULL;
*/
// Storage allocated by readOptsFromFile, one entry per option file read:
// Ppoptions[i] is the array of argument pointers and Poptions[i] the
// character buffer holding the arguments of the i-th option file.
// optionSets is the number of entries in both arrays.
static char *** Ppoptions = NULL;
static char ** Poptions = NULL;
static int optionSets = 0;

// Isolates the argument of the option whose letter is at line[off].
//
// The argument starts after the option letter and any blanks or tabs that
// follow it. Trailing white space is removed. If the argument starts with a
// quote (' or ") and the last occurrence of that same quote in the line is
// followed by nothing but white space up to the end of the line or up to a
// ';' (start of a comment), the text between the two quotes is the argument:
//     -x 'abc def' ; comment
// Otherwise the argument is the whole remainder of the line, quotes included.
//
// The line is modified in place. Returns a pointer into line, or NULL if
// there is no argument. An explicitly quoted empty string ('' or "") is
// returned as an empty string, not as NULL (needed for e.g. the -s option).
static char * isolateOptionArgument(char * line,int off)
    {
    char * arg = line + off + 1;
    arg += strspn(arg," \t");
    if(!*arg)
        return NULL;

    // Strip trailing white space, in particular the newline left by fgets.
    for(char * p = arg + strlen(arg) - 1;p >= arg && isspace((unsigned char)*p);--p)
        *p = '\0';

    bool quoted = false;
    if(*arg == '\'' || *arg == '"')
        {
        // Search backwards for the last quote of the same kind as the first.
        for(char * p = arg + strlen(arg) - 1;p > arg;--p)
            {
            if(*p == *arg)
                {
                quoted = true;
                for(char * q = p + 1;*q;++q)
                    {
                    if(*q == ';')
                        break;
                    if(!isspace((unsigned char)*q))
                        quoted = false;
                    }
                if(quoted)
                    {
                    *p = '\0';
                    ++arg;
                    }
                break;
                }
            }
        }
    if(!*arg && !quoted)
        return NULL;
    return arg;
    }

// Reads options from the file named locoptarg and applies them to Option.
//
// Each line of the file holds one option: '-', the option letter and
// optionally an argument (see isolateOptionArgument). Lines starting with ';'
// are comments. Lines that do not start with '-' are ignored. Leading blanks
// and tabs are skipped.
//
// The file is read twice. The first pass checks the syntax and computes how
// much memory is needed for the arguments; the second pass copies the
// arguments to that memory and hands each option to doSwitch. The memory is
// not released here, because Option keeps pointers to the arguments; it is
// registered in Ppoptions/Poptions.
//
// The program is terminated with exit(1) if an option letter is missing or if
// an argument is given to an option that does not take one.
//
// Returns the most severe result returned by doSwitch for any of the options,
// or GoOn if the file cannot be opened (a message is printed in that case).
static OptReturnTp readOptsFromFile(char * locoptarg,char * progname,optionStruct & Option)
    {
    OptReturnTp result = GoOn;
    FILE * fpopt = locoptarg ? fopen(locoptarg,"r") : NULL;
    if(!fpopt)
        {
        printf("Cannot open option file %s\n",locoptarg ? locoptarg : "(no file name given)");
        return result;
        }

    char line[1000];
    int lineno = 0;
    int bufsize = 0;

    // First pass: validate and count.
    while(fgets(line,sizeof(line) - 1,fpopt))
        {
        lineno++;
        int off = strspn(line," \t");
        if(line[off] != '-')
            continue; // comment line or not an option
        off++;
        if(!line[off])
            {
            printf("Missing option letter on line %d in option file \"%s\"\n",lineno,locoptarg);
            exit(1);
            }
        char * optarg2 = isolateOptionArgument(line,off);
        if(optarg2)
            bufsize += strlen(optarg2) + 1;
        char * optpos = strchr(opts,line[off]);
        if(optpos && optpos[1] != ':' && optarg2)
            {
            printf("Option argument %s provided for option letter %c that doesn't use it on line %d in option file \"%s\"\n",optarg2,line[off],lineno,locoptarg);
            exit(1);
            }
        }
    rewind(fpopt);

    char ** poptions = new char * [lineno];
    char * options = new char[bufsize];
    // update stacks that keep pointers to the allocated arrays.
    optionSets++;
    char *** tmpPpoptions = new char **[optionSets];
    char ** tmpPoptions = new char *[optionSets];
    int g;
    for(g = 0;g < optionSets - 1;++g)
        {
        tmpPpoptions[g] = Ppoptions[g];
        tmpPoptions[g] = Poptions[g];
        }
    tmpPpoptions[g] = poptions;
    tmpPoptions[g] = options;
    delete [] Ppoptions;
    Ppoptions = tmpPpoptions;
    delete [] Poptions;
    Poptions = tmpPoptions;

    // Second pass: store the arguments and apply the options.
    lineno = 0;
    bufsize = 0;
    while(fgets(line,sizeof(line) - 1,fpopt))
        {
        poptions[lineno] = options+bufsize;
        int off = strspn(line," \t");
        if(line[off] == ';')
            continue; // comment line
        if(line[off] == '-')
            {
            off++;
            if(line[off])
                {
                char * optarg2 = isolateOptionArgument(line,off);
                if(optarg2)
                    {
                    strcpy(poptions[lineno],optarg2);
                    bufsize += strlen(optarg2) + 1;
                    }
                // An option without argument must be passed as NULL:
                // poptions[lineno] then points to memory that is not (yet)
                // filled, or even past the end of the buffer.
                OptReturnTp res = doSwitch(line[off],optarg2 ? poptions[lineno] : NULL,progname,Option);
                if(res > result)
                    result = res;
                }
            }
        lineno++;
        }
    fclose(fpopt);
    return result;
    }



/*
Parse the command line into Option.

Resets Option.Wformat and Option.SortOutput, then hands every option that
getopt() finds in argv to doSwitch(). The value returned is the highest
OptReturnTp seen over all options (GoOn < Leave < Error), so a single
failing option makes the whole parse report Error.
*/
static OptReturnTp readArgs(int argc, char * argv[],optionStruct & Option)
    {
    Option.Wformat = NULL;
    Option.SortOutput = false;

    OptReturnTp worst = GoOn;
    for(int opt = getopt(argc,argv,opts);opt != -1;opt = getopt(argc,argv,opts))
        {
        const OptReturnTp outcome = doSwitch(opt,optarg,argv[0],Option);
        worst = (outcome > worst) ? outcome : worst;
        }
    return worst;
    }

/*
Report, through info(), the processor time used since t0.

t0 is a value previously obtained from clock(). The difference with the
current clock() value is printed as "seconds.milliseconds".
*/
static void showtime(clock_t t0)
    {
    clock_t t1;
    unsigned long span,sec,msec;
    t1 = clock();
    span = t1 - t0;
    sec = span / CLOCKS_PER_SEC;
    // Take the whole seconds out before scaling to milliseconds, so that the
    // multiplication by 1000 only ever sees less than one second of ticks.
    span -= sec * CLOCKS_PER_SEC;
    span *= 1000;
    msec = span / CLOCKS_PER_SEC;
    // sec and msec are unsigned long, so the conversions must be %lu.
    info("\nTime: %lu.%.3lu\n",sec,msec);
    }


int main(int argc, char * argv[])
    {
    if(argc == 1)
        {
        printf("\n");
        printf("CSTLEMMA version " CSTLEMMAVERSION " (" CSTLEMMADATE ")\n");
        printf("Copyright (C) " CSTLEMMACOPYRIGHT "\n");
// GNU >> 
        printf("CSTLEMMA comes with ABSOLUTELY NO WARRANTY; for details use option -w.\n");
        printf("This is free software, and you are welcome to redistribute it under\n");
        printf("certain conditions; use option -r for details.\n");
        printf("\n\n");
// << GNU
        printf("Use option -h for usage.\n");
        return 0;
        }

    FILE * fpdict = NULL;
    FILE * fpin;
    FILE * fpout;
    FILE * fpflex;
    FILE * fpv= NULL;
    FILE * fpx = NULL;
    FILE * fpz = NULL;
    FILE * ffreq = NULL;
    optionStruct Option;
    clock_t t0;
    t0 = clock();

    OptReturnTp optResult = readArgs(argc,argv,Option);
    if(optResult == Error)
        return 1;

    if(optResult == Leave)
        { // option -r, -w, -? or -h
        return 0;
        }

    if(!Option.argo && !Option.argi)
        DoInfo = false;

    switch(Option.whattodo)
        {
        case MAKEDICT:
            {
            if(!Option.cformat)
                {
                printf("You need to specify a column-order with the -c option\n");
                return -1;
                }
            if(Option.argi)
                {
                fpin = fopen(Option.argi,"r");
                if(!fpin)
                    {
                    printf("Cannot open input file \"%s\" for reading\n",Option.argi);
                    return -1;
                    }
                }
            else
                fpin = stdin;

            if(Option.argo)
                {
                fpout = fopen(Option.argo,"wb");
                if(!fpout)
                    {
                    printf("Cannot open binary dictionary \"%s\" for writing\n",Option.argo);
                    return -1;
                    }
                }
            else
                fpout = stdout;
/*
            if(freq)
                {
                if(!fformat)
                    {
                    printf("Please specify the format of the frequency file with the -n option\n");
                    }
                ffreq = fopen(freq,"r");
                if(!ffreq)
                    {
                    printf("Cannot open frequency file \"%s\" for reading\n",argo);
                    return -1;
                    }
                }
*/
            int ret = makedict(fpin,fpout,Option.nice,Option.cformat,Option.freq,Option.CollapseHomographs);
            if(fpin != stdin)
                fclose(fpin);
            if(fpout != stdout)
                fclose(fpout);
            if(ffreq)
                fclose(ffreq);
            showtime(t0);
            return ret;
            }
        case MAKEFLEXPATTERNS:
            {
            if(!Option.cformat)
                {
                printf("You need to specify a column-order with the -c option\n");
                return -1;
                }


            if(Option.argi)
                {
                fpdict = fopen(Option.argi,"r");
                if(!fpdict)
                    return -1;
                }
            else
                fpdict = stdin;
        
            if(Option.flx)
                {
                fpflex = fopen(Option.flx,"rb");
                if(fpflex)
                    {
                    Flex.readFromFile(fpflex);
                    fclose(fpflex);
                    }
                }

            if(Option.argo)
                {
                fpflex = fopen(Option.argo,"wb");
                if(!fpflex)
                    return -1;
                }
            else
                fpflex = stdout;

            int failed;
            Flex.makeFlexRules(fpdict,fpflex,Option.nice,Option.cformat,failed);

            if(fpdict != stdin)
                fclose(fpdict);

            if(fpflex != stdout)
                fclose(fpflex);

            showtime(t0);
            return 0;
            }
        default:
            {
            /*
            if(!InputHasTags)
                {
                printf("Currently, the program only accepts tagged texts.\n");
                exit(1);
                }
                */
            info("\nFormats:\n");
            if(Option.Iformat)
                {
                info("-I\t%s\tInput format.\n",Option.Iformat);
                }
            if(Default_b_format == Option.bformat && Default_b_format != Option.Bformat)
                Option.bformat = NULL;
            else if(Default_b_format == Option.Bformat && Default_b_format != Option.bformat)
                Option.Bformat = NULL;
            else if(Default_b_format == Option.Bformat && Default_b_format == Option.bformat && Option.Wformat)
                {
                printf("You need to specify -b or -B formats if you specify the -W format\n");
                return -1;
                }
            /*if(!cformat && !Wformat)
                {
//                cformat = "iwbBtTfF"; // TODO THIS IS A HACK
                printf("You need to specify an output cformat with the -c (or -W plus -b or -B) option\n");
                return -1;
                }
            else*/ if((Option.cformat != DefaultCFormat) && Option.Wformat)
                {
//                cformat = "iwbBtTfF"; // TODO THIS IS A HACK
                printf("You cannot specify both -c and -W format options\n");
                return -1;
                }
            else if(Option.cformat && !Option.Wformat)
                {
                if(!Option.InputHasTags && Option.cformat == DefaultCFormat)
                    Option.cformat = DefaultCFormat_NoTags;
                info("-c\t%s\tOutput format\n",Option.cformat);
                if(Option.bformat)
                    info("-b\t%s\tDictionary base form output format.\n",Option.bformat);
                if(Option.Bformat)
                    info("-B\t%s\tComputed base form output format.\n",Option.Bformat);
                Option.listLemmas = 0;
                }
            else
                {
                if(Option.bformat)
                    {
                    info("-b\t%s\tOutput format for data pertaining to the base form, according to the dictionary\n",Option.bformat);
                    Option.listLemmas |= 1;
                    }
                if(Option.Bformat)
                    {
                    Option.listLemmas |= 2;
                    info("-B\t%s\tOutput format for data pertaining to the base form, as predicted by flex pattern rules.\n",Option.Bformat);
                    }
                if(!Option.listLemmas)
                    {
                    printf("You must specify at least one of -b and -B if you do not specify -c.\n");
                    return -1;
                    }
//                format = Wformat;
                if(Option.Wformat)
                    info("-W\t%s\tOutput format for data pertaining to full forms.\n",Option.Wformat);
                }
            bool SortInput;
            if(Option.listLemmas)
                {
                SortInput = basefrm::setFormat(Option.Wformat,Option.bformat,Option.Bformat,Option.InputHasTags);
                if(SortInput)
                    info("Input is sorted before processing (due to $f field in -W<format> argument)\n",Option.flx);
                }
            else
                {
                SortInput = taggedText::setFormat(Option.cformat,Option.bformat,Option.Bformat,Option.InputHasTags);
                if(SortInput)
                    info("Input is sorted before processing (due to $f field in -c<format> argument)\n",Option.flx);
                }
            if(!SortInput)
                {
                if(Option.listLemmas || Option.UseLemmaFreqForDisambiguation < 2)
                    SortInput = true;// performance

                }
                
            info("\nFiles:\n");
            if(Option.flx)
                {
                fpflex = fopen(Option.flx,"rb");
                if(fpflex)
                    {
                    info("-f\t%-20s\tFlexpatterns\n",Option.flx);
                    }
                else
                    {
                    info("-f\t%-20s\t(Flexpatterns): Cannot open file.\n",Option.flx);
                    return -1;
                    }
                }
            else
                {
                printf("-f  Flexpatterns: File not specified.\n");
                return -1;
                }

            if(Option.dictfile)
                {
                fpdict = fopen(Option.dictfile,"rb");
                if(!fpdict)
                    {
                    printf("-d\t%-20s\t(Dictionary): Cannot open file.\n",Option.dictfile);
                    return -1;
                    }
                else
                    info("-d\t%-20s\tDictionary\n",Option.dictfile);
                }
            else
                {
                info("-d\tDictionary: File not specified.\n");
//                return -1;
                }

            if(Option.InputHasTags)
                {
                if(Option.v)
                    {
                    fpv = fopen(Option.v,"r");
                    if(!fpv)
                        {
                        printf("-v\t%-20s\t(Tag friends file): Cannot open file.\n",Option.v);
                        return -1;
                        }
                    else
                        info("-v\t%-20s\t Tag friends file\n",Option.v);
                    }
                else
                    info("-v\tTag friends file: File not specified.\n");

                if(Option.x)
                    {
                    fpx = fopen(Option.x,"r");
                    if(!fpx)
                        {
                        printf("-x\t%-20s\t(Lexical type translation table): Cannot open file.\n",Option.x);
                        return -1;
                        }
                    else
                        info("-x\t%-20s\t Lexical type translation table\n",Option.x);
                    }
                else
                    info("-x\tLexical type translation table: File not specified.\n");
    
                if(Option.z)
                    {
                    fpz = fopen(Option.z,"r");
                    if(!fpz)
                        {
                        printf("-z\t%-20s\t(Full form - Lemma type conversion table): Cannot open file.\n",Option.z);
                        return -1;
                        }
                    else
                        info("-z\t%-20s\tFull form - Lemma type conversion table\n",Option.z);
                    }
                else
                    info("-z\tFull form - Lemma type conversion table: File not specified.\n");
                }
            else
                { // Bart 20021105
                if(Option.z)
                    {
                    fpz = fopen(Option.z,"r");
                    if(!fpz)
                        {
                        printf("-z\t%-20s\t(Full form - Lemma type conversion table): Cannot open file.\n",Option.z);
                        return -1;
                        }
                    else
                        info("-z\t%-20s\tFull form - Lemma type conversion table\n",Option.z);
                    }
                }

            if(Option.argi)
                {
//                fpin = fopen(argi,"rb");
                fpin = fopen(Option.argi,"rt");
                if(!fpin)
                    {
                    printf("-i  %-20s (Input text): Cannot open file.\n",Option.argi);
                    return -1;
                    }
                else
                    info("-i\t%-20s\tInput text\n",Option.argi);
                }
            else
                {
                info("-i\tInput text: Using standard input.\n");
                fpin = stdin;
                }
            if(Option.argo)
                {
                fpout = fopen(Option.argo,"wb");
                if(!fpout)
                    {
                    printf("-o  %-20s (Output text): Cannot open file.\n",Option.argo);
                    return -1;
                    }
                else
                    info("-o\t%-20s\tOutput text\n",Option.argo);
                }
            else
                {
                if(Option.argi)
                    {
                    char buffer[256];
                    Option.argo = buffer;
                    sprintf(buffer,"%s.lemma",Option.argi);
                    fpout = fopen(Option.argo,"wb");
                    if(!fpout)
                        {
                        printf("-o  %-20s (Output text): Cannot open file.\n",Option.argo);
                        return -1;
                        }
                    else
                        info("-o\t%-20s\tOutput text\n",Option.argo);
                    }
                else
                    {
                    DoInfo = false;
                    info("-o\tOutput text: Using standard output.\n");
                    fpout = stdout;
                    }
                }
/*
            if(argn)
                {
                fpnew = fopen(argn,"w");
                if(!fpnew)
                    {
                    printf("-n  %-20s (New words): Cannot open file.\n",argn);
                    return -1;
                    }
                else
                    printf("-n  %-20s New words\n",argn);
                }
            else
                {
                printf("-n  New words are not written to a file\n");
                fpnew = NULL;
                }

            if(argm)
                {
                fphom = fopen(argm,"w");
                if(!fphom)
                    {
                    printf("-m  %-20s (Conflicts): Cannot open file.\n",argm);
                    return -1;
                    }
                else
                    printf("-m  %-20s Conflicts\n",argm);
                }
            else
                {
                printf("-n  Conflicts are not written to a file\n");
                fphom = NULL;
                }
*/
            info("\nSwitches:\n");

            if(Option.InputHasTags)
                info("-t\tInput has tags.\n");
            else
                {
                info("-t-\tInput has no tags.\n");
                if(!Option.Iformat)
                    {
                    if(Option.keepPunctuation == 1)
                        info("-p\tKeep punctuation.\n");
                    else if(Option.keepPunctuation == 2)
                        info("-p+\tTreat punctuation as separate tokens.\n");
                    else
                        info("-p-\tIgnore punctuation.\n");
                    }
                }

            if(!Option.Wformat)
                {
                if(Option.SortOutput)
                    {
                    /*if(Option.Wformat)
                        printf("-q\t(Irrelevant option in combination with -W.)\n");
                    else*/
                        {
                        SortInput = true;
                        info("-q\tSort output.\n");
                        info("Input is sorted before processing (due to option -q)\n",Option.flx);
                        }
                    }
                else
                    {
                    /*if(Option.Wformat)
                        printf("-q\t(Irrelevant option in combination with -W.)\n");
                    else*/
                        info("-q-\tDo not sort output.(default)\n");
                    }
                }

            if(!SortInput)
                info("Input is not sorted before processing (no option -q and no $f field in -c<format> or -W<format> argument)\n",Option.flx);
/*
            if(OutputHasFullForm)
                printf("-a    Output file contains full form of each word.\n");
            else
                printf("-a-   Output file does not contain full form of each word.\n");
*/
            if(!strcmp(Option.Sep,"\t"))
                info("-s\tAmbiguous output is tab-separated\n");
            else if(!strcmp(Option.Sep," "))
                info("-s" commandlineQuote " " commandlineQuote "\tAmbiguous output  is blank-separated\n");
            else if(!strcmp(Option.Sep,DefaultSep))
                info("-s%s\tAmbiguous output is " commandlineQuote "%s" commandlineQuote "-separated (default)\n",Option.Sep,Option.Sep);
            else
                info("-s%s\tAmbiguous output is " commandlineQuote "%s" commandlineQuote "-separated\n",Option.Sep,Option.Sep);

            if(Option.RulesUnique)
                info("-U\tenforce unique flex rules (default)\n");
            else
                info("-U-\tallow ambiguous flex rules\n");

            if(Option.DictUnique)
                info("-u\tenforce unique dictionary look-up (default)\n");
            else
                info("-u-\tallow ambiguous dictionary look-up\n");
            switch(Option.UseLemmaFreqForDisambiguation)
                {
                case 0: info("-H0\tuse lemma frequencies for disambigation (default)\n");
                    basefrm::hasW = true;
                    break;
                case 1: info("-H1\tuse lemma frequencies for disambigation, show pruned lemmas between <<>>\n");
                    basefrm::hasW = true;
                    break;
                case 2: info("-H2\tdon't use lemma frequencies for disambigation\n");break;
                }
            if(Option.baseformsAreLowercase)
                info("-l\tlemmas are forced to lowercase (default)\n");
            else
                info("-l-\tlemmas are same case as full form\n");
            flex::baseformsAreLowercase = Option.baseformsAreLowercase;

            if(Option.size < ULONG_MAX)
                info("-m%lu\tReading max %lu words from input\n",Option.size,Option.size);
            else
                info("-m0\tReading unlimited number of words from input (default).\n");

            if(fpflex)
                {
                Flex.readFromFile(fpflex);
                if(Option.RulesUnique)
                    Flex.removeAmbiguous();
                if(Option.nice)
                    {
                    printf("\n");
                    Flex.print();
                    }
                fclose(fpflex);
                }

            if(fpv)
                {
                TagFriends = new tagpairs(fpv,Option.nice);
/*                if(!readTags(fpx,nice))
                    {
                    fclose(fpx);
                    return -1;
                    }*/
                fclose(fpv);
                }

            if(fpx)
                {
                TextToDictTags = new tagpairs(fpx,Option.nice);
/*                if(!readTags(fpx,nice))
                    {
                    fclose(fpx);
                    return -1;
                    }*/
                fclose(fpx);
                }

            if(fpz)
                {
                if(!readLemmaTags(fpz,Option.nice))
                    {
                    fclose(fpz);
                    return -1;
                    }
                fclose(fpz);
                }

            if(Option.nice && fpdict)
                printf("\nreading dictionary \"%s\"\n",Option.dictfile);
            dictionary dict(fpdict,Option.DictUnique);
            if(fpdict)
                fclose(fpdict);
//            dict.printall(fpout);
//            dict.printall2(fpout);

            unsigned long int totcnt = 0;
            unsigned long int totcntTypes = 0;
            unsigned long int newcnt = 0;
            unsigned long int newcntTypes = 0;
            unsigned long int newhom = 0;
            unsigned long int newhomTypes = 0;
    
            if(Option.nice)
                printf("reading text\n");
            taggedText text(fpin,Option.InputHasTags,Option.listLemmas,Option.Iformat,Option.keepPunctuation,Option.nice,Option.size,Option.treatSlashAsAlternativeSeparator);
            if(fpin != stdin)
                fclose(fpin);

            if(Option.nice)
                printf("processing\n");
            text.unknownWords(fpout,/*fpnew,fphom,*/Option.Sep,
                                     totcnt,totcntTypes,
                                     newcnt,newcntTypes,
                                     newhom,newhomTypes,
                                     SortInput,Option.SortOutput,Option.UseLemmaFreqForDisambiguation,Option.nice
                                     );
            info("\nall words      %10.lu\n"
                     "unknown words  %10.lu (%lu%%)\n"
                     "conflicting    %10.lu (%lu%%)\n\n"
                     ,totcnt
                     ,newcnt,totcnt ? (newcnt*200+1)/(2*totcnt) : 100
                     ,newhom,totcnt ? (newhom*200+1)/(2*totcnt) : 100
                     );
            if(SortInput)
                info("\nall types      %10.lu\n"
                         "unknown types  %10.lu (%lu%%)\n"
                         "conflicting    %10.lu (%lu%%)\n"
                         ,totcntTypes
                         ,newcntTypes,totcntTypes ? (newcntTypes*200+1)/(2*totcntTypes) : 100UL
                         ,newhomTypes,totcntTypes ? (newhomTypes*200+1)/(2*totcntTypes) : 100UL
                         );

        //    Flex.write(fpflex);
//            fclose(fpflex);
            if(fpout != stdout)
                fclose(fpout);
//            fclose(fpnew);
            showtime(t0);
            delete TextToDictTags;
            delete TagFriends;
            for(int i = 0;i < optionSets;++i)
                {
                delete [] Poptions[i];
                delete [] Ppoptions[i];
                }
            delete [] Poptions;
            delete [] Ppoptions;
            return 0;
            }
        }
    }
