/*
CSTLEMMA - trainable lemmatiser using word-end inflectional rules

Copyright (C) 2002, 2004  Center for Sprogteknologi, University of Copenhagen

This file is part of CSTLEMMA.

CSTLEMMA is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

CSTLEMMA is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with CSTLEMMA; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
#include "text.h"
#include "word.h"
#include "dictionary.h"
#include "basefrm.h"
#include "caseconv.h"
#include <ctype.h>
#include <assert.h>
#include <stdlib.h>

// Orders two base forms. The result of cmpt() decides; only when cmpt()
// reports a tie (zero) is the result of cmps() returned instead.
static int cmpBaseforms(basefrm * n1,basefrm * n2)
    {
    const int primary = n1->cmpt(n2);
    return primary ? primary : n1->cmps(n2);
    }

// qsort() callback: both arguments point at array elements, and every
// element is itself a pointer to a basefrm.
static int compareBaseforms( const void *arg1, const void *arg2 )
    {
    return cmpBaseforms(*(basefrm **)arg1,*(basefrm **)arg2);
    }

// Sorts the cnt pointers in pbf with cmpBaseforms() and merges entries that
// compare equal: each duplicate is handed to getAbsorbedBy() of the first
// entry of its group and removed from the array. The surviving entries are
// packed at the front of the array in sorted order and every slot behind
// them is set to NULL.
// Returns the number of surviving entries (0 for an empty array).
static int sortBaseforms(basefrm ** pbf,int cnt)
    {
    if(cnt <= 0)
        return 0; // nothing to sort; the array holds no entry at all
    qsort((void *)pbf,cnt,sizeof(basefrm *),compareBaseforms);
    int last = 0; // index of the most recently kept entry
    for(int j = 1;j < cnt;++j)
        {
        if(!cmpBaseforms(pbf[last],pbf[j]))
            {
            // Duplicate of the entry we are keeping: merge and drop it.
            pbf[j]->getAbsorbedBy(pbf[last]);
            pbf[j] = NULL;
            }
        else
            {
            // New distinct entry: move it directly behind the kept ones.
            ++last;
            if(last != j)
                {
                pbf[last] = pbf[j];
                pbf[j] = NULL;
                }
            }
        }
    return last + 1;
    }

// Base class of the elements of an input format. The elements form a singly
// linked chain through 'next'; deleting an element deletes all elements
// that follow it.
class field
    {
    protected:
        field * next;
    public:
        field():next(NULL){}
        virtual ~field()
            {
            delete next;
            }
        // Offers the character(s) in 'kar' to this element.
        // Returns NULL if 'kar' was accepted; otherwise returns the text
        // that was not accepted. May change 'nextfield'.
        virtual char * read(char * kar,field *& nextfield) = 0;
        // Appends 'fld' at the end of the chain that starts at this element.
        void addField(field * fld)
            {
            field * last = this;
            while(last->next)
                last = last->next;
            last->next = fld;
            }
        // Default: ignore the character.
        virtual void add(char){}
        // Default: this element has no text to hand out.
        virtual char * getString()
            {
            return NULL;
            }
        // Default: only pass the reset on to the rest of the chain.
        virtual void reset()
            {
            if(next)
                next->reset();
            }
    };

// Format element that collects text: everything the following element does
// not accept is appended to an internal, growing buffer.
class readValue : public field
    {
    private:
        char * word; // collected text; word[pos] is always the terminating nul
        int len;     // number of chars allocated for 'word'
        int pos;     // number of chars collected so far
    public:
        readValue()
            {
            len = 20;
            word = new char[len];
            word[0] = '\0';
            pos = 0;
            }
        // 'word' comes from new[], so it has to be released with delete [].
        ~readValue(){delete [] word;}
        // Forgets the collected text and resets the rest of the chain.
        void reset()
            {
            pos = 0;
            word[0] = '\0'; // keep the buffer consistent with pos
            field::reset();
            }
        // Returns the collected text (an empty string if nothing was
        // collected). The pointer refers to the internal buffer.
        char * getString()
            {
            return word;
            }
        // Offers 'kar' to the next element first. Whatever that element
        // hands back (or 'kar' itself if there is no next element) is
        // appended to the collected text. Always returns NULL.
        virtual char * read(char * kar,field *& nextfield)
            {
            char * nxt = kar;
            if(next)
                nxt = next->read(kar,nextfield);
            if(nxt)
                {
                int ln = strlen(nxt);
                if(pos + ln >= len)
                    {
                    // Grow with some slack so that a long value does not
                    // cause a reallocation for every single character.
                    len = 2 * (pos + ln + 1);
                    char * nw = new char[len];
                    strcpy(nw,word);
                    delete [] word;
                    word = nw;
                    }
                strcpy(word + pos,nxt);
                pos += ln;
                }
            return NULL; // Value takes all!
            }
    };

// Format element that consumes a run of white space characters.
class readWhiteSpace : public field
    {
    private:
        bool found; // true once at least one white space char was consumed
    public:
        readWhiteSpace():found(false){}
        ~readWhiteSpace(){}
        // White space is accepted and this element stays the current one.
        // The first other character after some white space is passed on to
        // the next element. A non-white-space character seen before any
        // white space is handed back to the caller.
        virtual char * read(char * kar,field *& nextfield)
            {
            // isspace() is only defined for values representable as
            // unsigned char (or EOF); plain char may be negative for
            // characters outside ASCII.
            if(isspace((unsigned char)*kar))
                {
                found = true;
                nextfield = this;
                return NULL;
                }
            else if(found)
                {
                if(next)
                    next->read(kar,nextfield);
                assert(nextfield != this);
                return NULL;
                }
            else
                return kar; // add to previous
            }
        void reset()
            {
            found = false;
            field::reset();
            }
    };

// Format element that consumes a run of characters that are not white space.
class readAllButWhiteSpace : public field
    {
    private:
        bool found; // true once at least one non-white-space char was consumed
    public:
        readAllButWhiteSpace():found(false){}
        ~readAllButWhiteSpace(){}
        // Non-white-space is accepted and this element stays the current
        // one. The first white space character after that is passed on to
        // the next element. White space seen before anything else is handed
        // back to the caller.
        virtual char * read(char * kar,field *& nextfield)
            {
            // isspace() is only defined for values representable as
            // unsigned char (or EOF); plain char may be negative for
            // characters outside ASCII.
            if(isspace((unsigned char)*kar))
                {
                if(found)
                    {
                    if(next)
                        next->read(kar,nextfield);
                    return NULL;
                    }
                else
                    return kar;
                }
            else
                {
                found = true;
                nextfield = this;
                return NULL;
                }
            }
        void reset()
            {
            found = false;
            field::reset();
            }
    };

// Format element that matches exactly one tab character.
class readTab : public field
    {
    public:
        readTab(){}
        ~readTab(){}
        // A tab is accepted and the next element becomes the current one.
        // Any other character is handed back to the caller.
        virtual char * read(char * kar,field *& nextfield)
            {
            if(*kar != '\t')
                return kar;
            nextfield = next;
            return NULL;
            }
    };

// Format element that matches exactly one newline character.
class readNewLine : public field
    {
    public:
        readNewLine(){}
        ~readNewLine(){}
        // A newline is accepted and the next element becomes the current
        // one. Any other character is handed back to the caller.
        virtual char * read(char * kar,field *& nextfield)
            {
            if(*kar != '\n')
                return kar;
            nextfield = next;
            return NULL;
            }
    };

// Format element that matches a fixed piece of text, one character per call
// of read(). Characters that turn out not to belong to the literal are
// handed back to the caller.
class readLitteral : public field
    {
    char * litteral; // the text to match, nul-terminated
    char * matched;  // the characters seen during the current match attempt
    char * giveback; // characters returned after a partial re-match
    int len;         // allocated size of 'litteral' and 'matched'
    int pos;         // number of characters of 'litteral' matched so far
    int givebacklen; // allocated size of 'giveback'
    public:
        readLitteral(char first)
            {
            giveback = NULL;
            givebacklen = 0;
            litteral = new char[2];
            litteral[0] = first;
            litteral[1] = '\0';
            matched = new char[2];
            len = 2;
            pos = 0;
            }
        // All three buffers come from new[] and are released with delete [].
        ~readLitteral()
            {
            delete [] litteral;
            delete [] matched;
            delete [] giveback;
            }
        // Appends 'kar' to the literal and enlarges 'matched' accordingly.
        void add(char kar)
            {
            ++len;
            char * nw = new char[len];
            delete [] matched;
            matched = new char[len];
            strcpy(nw,litteral);
            nw[len-2] = kar;
            nw[len-1] = '\0';
            delete [] litteral;
            litteral = nw;
            }
        // Returns NULL while 'kar' continues the literal, or, once the whole
        // literal has been matched, after passing 'kar' on to the next
        // element. On a mismatch the characters that cannot be part of the
        // literal are returned.
        virtual char * read(char * kar,field *& nextfield)
            {
            if(pos == len - 1) // all matched
                {
                if(next)
                    next->read(kar,nextfield);
                return NULL;
                }
            else
                {
                matched[pos] = *kar;
                if(*kar == litteral[pos])
                    {
                    pos++;
                    return NULL;
                    }
                else
                    {
                    // matched[0..pos] does not continue the literal. Look
                    // for the longest proper suffix matched[j..pos] (its
                    // length is l, and j + l == pos + 1) that is again a
                    // prefix of the literal.
                    int j;
                    int l = pos;
                    for(j = 1;l;++j,--l)
                        if(!strncmp(litteral,matched+j,l))
                            {
                            if(givebacklen <= j)
                                {
                                delete [] giveback;
                                giveback = new char[j+1];
                                givebacklen = j+1;
                                }
                            int k;
                            // The first j characters are given back ...
                            for(k = 0;k < j;++k)
                                giveback[k] = matched[k];
                            giveback[k] = '\0';
                            // ... and the whole suffix matched[j..pos] is
                            // moved to the front. The bound has to be pos:
                            // with l as bound the suffix was only partly
                            // copied whenever j was larger than 1.
                            for(;k <= pos;++k)
                                matched[k-j] = matched[k];
                            matched[k-j] = '\0';
                            pos = pos - j + 1;
                            return giveback;
                            }
                    // No suffix fits: give back everything seen so far.
                    matched[pos+1] = '\0';
                    pos = 0;
                    return matched;
                    }
                }
            }
        void reset()
            {
            pos = 0;
            field::reset();
            }
    };

// Head of the chain of format elements built by translateFormat().
static field * fields = NULL;

// Appends 'fld' to the chain; the first element added becomes the head.
static void AddField(field * fld)
    {
    if(fields)
        fields->addField(fld);
    else
        fields = fld;
    }

// The format string most recently passed to translateFormat(); getwordI()
// shows it in its error message.
static char * globIformat = NULL;

// Prints the format string with a caret below the offending character 'at'
// and terminates the program.
static void invalidFormat(const char * Iformat,const char * at)
    {
    printf("Invalid format string \"%s\"\n",Iformat);
    printf("                        %*c\n",(int)(at - Iformat),'^');
    exit(0);
    }

// Translates the input format string 'Iformat' into a chain of field
// objects, appended to the file-level chain 'fields'.
//   $w  the word   (at most once; 'wordfield' is set to the new element)
//   $t  the tag    (at most once; 'tagfield' is set to the new element)
//   $d  a value that is read and not handed out
//   \s  white space          \S  anything but white space
//   \t  a tab                \n  a newline
// Any other run of characters is matched literally.
// 'wordfield' and 'tagfield' must be NULL on entry. An invalid format
// string is reported and the program is terminated.
// Returns the head of the chain.
static field * translateFormat(char * Iformat,field *& wordfield,field *& tagfield)
    {
    globIformat = Iformat;
    bool escape = false; // previous character was '\'
    bool afield = false; // previous character was '$'
    char * pformat;
    field * litteral = NULL; // literal under construction, if any
    for(pformat = Iformat;*pformat;++pformat)
        {
        if(afield)
            {
            afield = false;
            switch(*pformat)
                {
                case 'w':
                    if(wordfield)
                        invalidFormat(Iformat,pformat);
                    wordfield = new readValue();
                    litteral = NULL;
                    AddField(wordfield);
                    break;
                case 't':
                    if(tagfield)
                        invalidFormat(Iformat,pformat);
                    tagfield = new readValue();
                    litteral = NULL;
                    AddField(tagfield);
                    break;
                case 'd':
                    litteral = NULL;
                    AddField(new readValue());
                    break;
                default:
                    invalidFormat(Iformat,pformat);
                    break;
                }
            }
        else if(escape)
            {
            escape = false;
            switch(*pformat)
                {
                case 's':
                    litteral = NULL;
                    AddField(new readWhiteSpace);
                    break;
                case 'S':
                    litteral = NULL;
                    AddField(new readAllButWhiteSpace);
                    break;
                case 't':
                    litteral = NULL;
                    AddField(new readTab);
                    break;
                case 'n':
                    litteral = NULL;
                    AddField(new readNewLine);
                    break;
                default:
                    invalidFormat(Iformat,pformat);
                    break;
                }
            }
        else if(*pformat == '\\')
            {
            escape = true;
            }
        else if(*pformat == '$')
            {
            afield = true;
            }
        else
            {
            // Consecutive ordinary characters are collected in one literal.
            if(!litteral)
                {
                litteral = new readLitteral(*pformat);
                AddField(litteral);
                }
            else
                litteral->add(*pformat);
            }
        }
    return fields;
    }

// Decides whether 'buf' is a well-formed list of "/"-separated alternatives.
// Returns 'slashFound' unchanged if it is, and 0 if 'slashFound' is 0, if
// 'buf' is empty, or if 'buf' breaks one of the rules checked below.
static int sanityCheck(int slashFound,const char * buf)
    {
    if(!slashFound)
        return 0;
    if(*buf == '\0')
        return 0; // nothing to check; also keeps *(p - 1) below inside buf
    if(*buf == '/')
        return 0; // list of alternatives does not start with "/"
    const char * p = buf;
    // *(p - 1) is never evaluated for p == buf: *buf is known not to be '/'.
    while(  isAlpha(*p) // list of alternatives does only have alphabetic characters
         || (  *p == '/'
            && *(p - 1) != '/' // list of alternatives does not contain "//"
            )
         )
        ++p;
    if(*p)
        return 0;
    if(*(p - 1) == '/')
        return 0; // list of alternatives does not end with "/"
    return slashFound;
    }

// Counts the '/' characters in 'buf'.
// Returns 0 if 'buf' is empty, starts with '/', ends with '/' or contains
// two adjacent slashes; otherwise returns the number of slashes.
int findSlashes(const char * buf)
    {
    if(*buf == '\0' || *buf == '/')
        return 0;
    int slashes = 0;
    const char * q = buf + 1; // buf[0] is known not to be a slash
    for(;*q;++q)
        {
        if(*q != '/')
            continue;
        if(q[1] == '/')
            return 0;
        ++slashes;
        }
    // q now points at the terminating nul, so q[-1] is the last character.
    if(slashes && q[-1] == '/')
        return 0;
    return slashes;
    }

static char * getwordI(FILE * fpin,char *& tag,field * format,field * wordfield,field * tagfield,unsigned long & lineno)
    {
    format->reset();
    assert(wordfield);
    int kar = EOF;
    char kars[2];
    kars[1] = '\0';
    char line[256];
    int i = 0;
    field * nextfield = format;
    while(nextfield && (kar = fgetc(fpin)) != EOF)
        {
        if(kar == '\n')
            ++lineno;
        kars[0] = (char)kar;
        if(i == sizeof(line) - 1)
            {
            printf("Unable to apply input format specification to this text.\n");
            printf("Input format:'%s'\n",globIformat);
            printf("Text:'%s...'\n",line);
            exit(0);
            }
        line[i++] = (char)kar;
        line[i] = '\0';
        nextfield->read(kars,nextfield);
        }
    if(kar == EOF)
        {
        tag = NULL;
        format->reset();
        return NULL;
        }
    if(tagfield)
        tag = tagfield->getString();
//    format->reset();
    return wordfield->getString();
    }

// Reads the next token from 'fp' into a static buffer and returns it.
//
// InputHasTags:    a '/' ends the word and the text up to the next white
//                  space is stored as its tag ('tag' then points at a second
//                  static buffer). If that text contains another '/', the
//                  text before it is taken to belong to the word instead.
// keepPunctuation: only looked at if InputHasTags is false.
//                  1: punctuation gets no special treatment,
//                  0: punctuation after the start of a word ends the word
//                     and is dropped,
//                  otherwise: punctuation ends the word and is returned as a
//                     token of its own by the next call.
//                  '-' and '\'' never end a word.
// slashFound:      receives the result of sanityCheck() for the token
//                  (0 if a pending punctuation character is returned).
// lineno:          incremented for every newline that is read.
//
// Returns NULL once after the end of the file has been reached; the call
// that hits the end of the file still returns the (possibly empty) token
// read so far. The buffers and the end-of-file/punctuation state are static,
// so the result is only valid until the next call.
static char * getword(FILE * fp,char *& tag,bool InputHasTags,int keepPunctuation, int & slashFound, unsigned long & lineno)
    {
    static int punct = 0;     // punctuation saved for the next call
    static char buf[1000];    // word
    static char buf2[256];    // tag
    static int eof = false;   // end of file seen during the previous call
    slashFound = 0;
    if(punct)
        {
        // Hand out the punctuation character saved by the previous call.
        // 'tag' is left as it is.
        buf[0] = (char)punct;
        buf[1] = '\0';
        punct = 0;
        return buf;
        }
    int kar;
    char *p;
    p = buf;
    tag = NULL;
    if(eof)
        {
        eof = false;
        return NULL;
        }
    while(true)
        {
        kar = fgetc(fp);
        if(kar == EOF)
            {
            *p = '\0';
            eof = true;
            slashFound = sanityCheck(slashFound,buf);
            return buf;
            }
        if(InputHasTags)
            {
            if(kar == '/')
                {// tag follows
                // TODO: How is a slash in the original text tagged? Answer: as a slash
                char * slash = p; // slash points at / in buf (the word)
                *p = '\0';        // the / is replaced by zero
                p = buf2;         // p points at the start of the tag buffer
                tag = buf2;
                while(true)
                    {
                    kar = fgetc(fp);
                    if(kar == EOF)
                        {
                        *p = '\0';
                        eof = true;
                        break;
                        }
                    if(isSpace(kar))
                        {
                        if(kar == '\n')
                            ++lineno;
                        if(p > buf2/*Bart 20030225. Sometimes slash and tag are on different lines*/)
                            {
                            *p = '\0';
                            break;
                            }
                        else
                            continue;
                        }
                    if(kar == '/') // oops, word contains slash
                        {
                        // The slash, the text collected in buf2 and the
                        // terminating nul must all fit behind 'slash'.
                        if((slash - buf) + (p - buf2) + 2 > (int)sizeof(buf))
                            {
                            *p = '\0';
                            printf("BUFFER OVERFLOW C [%s]\n",buf);
                            break;
                            }
                        ++slashFound; // Bart 20030801. Token may need special treatment as "/"-separated alternatives.
                        *slash = '/'; // put the slash back into the word (somewhere in buf)
                        *p = '\0';    // prepare buf2 for being copied back to buf
                        strcpy(slash+1,buf2); // do the copying
                        slash += p - buf2 + 1; // let slash point at end of buf (the nul byte)
                        p = buf2; // let p start again at start of buf2
                        }
                    else
                        {
                        if(p - buf2 == sizeof(buf2) - 1)
                            {
                            *p = '\0';
                            printf("BUFFER OVERFLOW A [%s]\n",buf2);
                            break;
                            }
                        *p++ = (char)kar;
                        }
                    }
                slashFound = sanityCheck(slashFound,buf);
                return buf;
                }
            }
        else if(  keepPunctuation != 1
               && p > buf /*Bart 20030225 + 1*/
               && ispunct(kar)
               && kar != '-' /*gr-det-selv*/
               && kar != '\'' /*bli'r*/
               )
            {
            if(keepPunctuation == 0)
                {
                // The punctuation character is dropped.
                *p = '\0';
                slashFound = sanityCheck(slashFound,buf);
                return buf;
                }
            else //if(keepPunctuation == 2)
                {
                // The punctuation character is returned by the next call.
                punct = kar;
                *p = '\0';
                slashFound = sanityCheck(slashFound,buf);
                return buf;
                }
            }
        if(isSpace(kar))
            {
            if(kar == '\n')
                ++lineno;
            *p = '\0';
            slashFound = sanityCheck(slashFound,buf);
            return buf;
            }
        if(p - buf == sizeof(buf) - 1)
            {
            // Buffer full: return what fits; the current character is lost.
            *p = '\0';
            printf("BUFFER OVERFLOW B [%s]\n",buf);
            slashFound = sanityCheck(slashFound,buf);
            return buf; // overflow?
            }
        if(kar == '/')
            ++slashFound; // Bart 20030801.
        *p++ = (char)kar;
        }
    }

#if QSORT

// Orders two tagged words: cmptag() decides, cmpword() breaks ties.
static int cmpTagged(taggedWord * n1,taggedWord * n2)
    {
    const int byTag = n1->cmptag(n2);
    return byTag ? byTag : n1->cmpword(n2);
    }


static int compareTaggedWords( const void *arg1, const void *arg2 )
    {
    taggedWord * n1 = *(taggedWord **)arg1;
    taggedWord * n2 = *(taggedWord **)arg2;
    assert(n1);
    assert(n2);
    return cmpTagged(n1,n2);
    }


// Orders two untagged words by the result of cmpword().
static int cmpUnTagged(unTaggedWord * n1,unTaggedWord * n2)
    {
    const int order = n1->cmpword(n2);
    return order;
    }


// qsort() callback for an array of pointers to unTaggedWord.
static int compareUnTaggedWords( const void *arg1, const void *arg2 )
    {
    return cmpUnTagged(*(unTaggedWord **)arg1,*(unTaggedWord **)arg2);
    }



#else
#endif




// Looks up every word of the text in the dictionary, collects the base
// forms, optionally disambiguates them and writes the result to 'fpo'.
//
// fpo:          file handed to setFile() before anything is printed.
// Sep:          stored in taggedWord::sep and basefrm::sep.
// totcnt, totcntTypes:          receive 'total' and 'reducedtotal'.
// newcnt, newcntTypes:          receive the counts for words for which
//                               dictionary::findword() failed.
// aConflict, aConflictTypes:    receive the counts for words for which
//                               addBaseFormsDL() reported a conflict.
// SortInput:    (QSORT builds only) call sort() before the lookup.
// SortOutput:   only used when words (not lemmas) are listed: print via the
//               sorted structure instead of in the order of 'unsorted'.
// UseLemmaFreqForDisambiguation: stored in baseformpointer; the lemma
//               frequency pass is skipped if the value is 2.
// nice:         print progress messages to stdout.
void taggedText::unknownWords(FILE * fpo,/*FILE * fpnew,FILE * fpconflict,*/const char * Sep,
                             unsigned long int & totcnt,unsigned long int & totcntTypes,
                             unsigned long int & newcnt,unsigned long int & newcntTypes,
                             unsigned long int & aConflict,unsigned long int & aConflictTypes,bool SortInput,bool SortOutput,
                             int UseLemmaFreqForDisambiguation,bool nice)
    {
    baseformpointer::UseLemmaFreqForDisambiguation = UseLemmaFreqForDisambiguation;
#if QSORT
    if(SortInput)
        {
        if(nice)
            printf("sorting input\n");
        sort();
        if(nice)
            printf("...sorted\n");
        }
#endif
    taggedWord::sep = Sep;
    basefrm::sep = Sep;
//    int cnt = 0;
#if QSORT
    aConflict = aConflictTypes = newcnt = newcntTypes = 0;
#else
    this->aConflict = this->aConflictTypes = this->newcnt = this->newcntTypes = 0;
#endif
    totcnt = total;
    totcntTypes = reducedtotal;
    unsigned long int k;
    // cntD and cntL count the base forms added by addBaseFormsDL() and
    // addBaseFormsL(); they determine the sizes of basefrmarrD/basefrmarrL.
#if QSORT
    int cntD = 0;
    int cntL = 0;
#else
    cntD = 0;
    cntL = 0;
#endif
    if(nice)
        printf("looking up words\n");
#if QSORT
    for(k = 0;k < reducedtotal;++k)
        {
        bool conflict = false;
        tcount Pos;
        int Nmbr;
        unTaggedWord * word_k = u.unTaggedWords[k];
        if(dictionary::findword(word_k->itsWord(),Pos,Nmbr))
            {
            // Word found in the dictionary.
            word_k->addBaseFormsDL(LEXT + Pos,Nmbr,conflict,cntD,cntL);
//            files.write(LEXT + Pos,Nmbr,TaggedWords[k]->word,TaggedWords[k]->tag,conflict);
            if(conflict)
                {
                aConflictTypes++;
                aConflict += word_k->itsCnt();
                }
            }
        else
            {
            // Word not found in the dictionary.
            newcntTypes++;
            newcnt += word_k->itsCnt();
            cntL += word_k->addBaseFormsL();
//            files.writeWordAndTag(TaggedWords[k]->word,TaggedWords[k]->tag,text.Sorted() ? TaggedWords[k]->cnt:-1);
            }
        if(basefrm::hasW)
            word_k->addFullForm();
        }
#else
    if(Root)
        Root->traverse(&unTaggedWord::lookup,this);
    aConflict = this->aConflict;
    aConflictTypes = this->aConflictTypes;
    newcnt = this->newcnt;
    newcntTypes = this->newcntTypes;
#endif
    // NOTE(review): the arrays are allocated on every call without
    // releasing arrays from an earlier call - confirm that this function is
    // called at most once per object.
    basefrmarrD = new basefrm * [cntD];
    basefrmarrL = new basefrm * [cntL];
#if QSORT
    basefrm ** D = &basefrmarrD[0];
    basefrm ** L = &basefrmarrL[0];
    for(k = 0;k < reducedtotal;++k)
        {
        u.unTaggedWords[k]->assignTo(D,L);
        }
#else
    D = &basefrmarrD[0];
    L = &basefrmarrL[0];
    if(Root)
        {
        Root->traverse(&unTaggedWord::assign,this);
        }
#endif
    // After the assignment D and L are expected to point just past the last
    // element of their arrays.
    assert(cntD == D - &basefrmarrD[0]);
    assert(cntL == L - &basefrmarrL[0]);
    // Sort both arrays and merge equal base forms; the unused tail of each
    // array is filled with NULL pointers, which the loops below rely on.
    /*int reducedCntD =*/ sortBaseforms(basefrmarrD,cntD);
    /*int reducedCntL =*/ sortBaseforms(basefrmarrL,cntL);
    /*
    rewind(fpo);
    */

    if(UseLemmaFreqForDisambiguation != 2 /*Why?-> && lext::DictUnique*/)
        {
        if(nice)
            printf("disambiguation by lemma frequency\n");
#if QSORT
        for(k = 0;k < reducedtotal;++k)
            {
            u.unTaggedWords[k]->DissambiguateByLemmaFrequency();
            }
        // Bart 20021105: correct frequencies
        
        for(k = 0;k < reducedtotal;++k)
            {
            u.unTaggedWords[k]->decFreq();
            }
#else
        if(Root)
            {
            Root->traverse0(&unTaggedWord::DissambiguateByLemmaFrequency);
            Root->traverse0(&unTaggedWord::decFreq);
            }
#endif            
        if(nice)
            printf("...disambiguated by lemma frequency\n");
        }
    if(TagFriends && InputHasTags)
        {
        if(nice)
            printf("disambiguation by tag friends\n");
//        printf("DissambiguateByTagFriends\n");
#if QSORT
        for(k = 0;k < reducedtotal;++k)
            {
            u.TaggedWords[k]->DissambiguateByTagFriends();
            }
#else
        if(Root)
            {
            ((taggedWord*)Root)->traverse0T(&taggedWord::DissambiguateByTagFriends);
            }
#endif            
        if(nice)
            printf("...disambiguated by tag friends\n");
        }

    // NOTE(review): unsorted[0] is read without checking that 'total' is
    // non-zero - confirm that an empty text cannot get here.
    unsorted[0]->setFile(fpo);
    if(listLemmas)
        {
        if(nice)
            printf("listing lemmas\n");
        int k; // hides the unsigned long k declared above
        // NOTE(review): basefrmarrD[0] is read without checking that cntD
        // is non-zero - confirm.
        basefrmarrD[0]->setFile(fpo);
        if(listLemmas & 1 && listLemmas & 2)
            {
            // Both lists requested: merge the two sorted arrays. Only
            // entries with a non-zero lemmaFreq() are printed here.
            int d = 0;
            int l = 0;
            while(d < cntD && basefrmarrD[d] && l < cntL && basefrmarrL[l])
                {
                if(cmpBaseforms(basefrmarrD[d],basefrmarrL[l]) < 0)
                    {
                    if(basefrmarrD[d]->lemmaFreq())
                        basefrmarrD[d]->printb();
                    d++;
                    }
                else
                    {
                    if(basefrmarrL[l]->lemmaFreq())
                        basefrmarrL[l]->printB();
                    l++;
                    }
                }
            // Print whatever is left in either array.
            while(d < cntD && basefrmarrD[d])
                {
                if(basefrmarrD[d]->lemmaFreq())
                    basefrmarrD[d]->printb();
                d++;
                }
            while(l < cntL && basefrmarrL[l])
                {
                if(basefrmarrL[l]->lemmaFreq())
                    basefrmarrL[l]->printB();
                l++;
                }
            }
        else if(listLemmas & 1)
            {
            // Only the D list; lemmaFreq() is not consulted here.
            for(k = 0;k < cntD && basefrmarrD[k];++k)
                basefrmarrD[k]->printb();
            }
        else if(listLemmas & 2)
            {
            // Only the L list; lemmaFreq() is not consulted here.
            for(k = 0;k < cntL && basefrmarrL[k];++k)
                basefrmarrL[k]->printB();
            }
        if(nice)
            printf("...listed lemmas\n");
        }
    else
        {
    //    unsorted[0]->setFile(fpo);
        if(nice)
            printf("listing words\n");
        if(SortOutput)
            {
#if QSORT
            for(k = 0;k < reducedtotal;++k)
                {
                u.unTaggedWords[k]->print(/*fpnew*/);
                }
#else
            if(Root)
                {
                Root->traverse0C(&unTaggedWord::print);
                }
#endif
            }
        else
            {
            // Print in the order in which the words were read; NULL entries
            // are skipped.
            for(k = 0;k < total;++k)
                {
                if(unsorted[k])
                    unsorted[k]->print(/*fpo*/);
                }
            }
        if(nice)
            printf("...listed words\n");
        }
#if 0
    unsorted[0]->setFile(fpnew);
    for(k = 0;k < reducedtotal;++k)
        {
        u.unTaggedWords[k]->printnew(/*fpnew*/);
        }
    unsorted[0]->setFile(fpconflict);
    for(k = 0;k < reducedtotal;++k)
        {
        u.unTaggedWords[k]->printConflict(/*fpconflict*/);
        }
#endif
//    totcnt = k;
//    return cnt;
    if(nice)
        printf("...text processed\n");
    }

// Splits 'w' at every '/' and stores each piece as an untagged word of its
// own. Each '/' is temporarily replaced by a nul and restored afterwards,
// so 'w' is unchanged on return. 'total' is incremented once per piece.
void taggedText::createUnTaggedAlternatives(char * w)
    {
    char * piece = w;
    while(piece && *piece)
        {
        char * const sep = strchr(piece,'/');
        if(sep)
            *sep = '\0'; // terminate the current piece
#if QSORT
        unsorted[total] = u.unTaggedWords[total] = new unTaggedWord(piece,unsorted + total);
#else
        if(!Root)
            {
            unTaggedWord * wrd = new unTaggedWord(piece);
            unsorted[total] = wrd;
            Root = wrd;
            }
        else
            {
            unsorted[total] = Root->insert(piece);
            }
#endif
        ++total;
        if(!sep)
            break; // that was the last piece
        *sep = '/'; // restore the separator
        piece = sep + 1;
        }
    }

// Stores 'w' as the next untagged word and increments 'total'.
// An empty string is ignored.
void taggedText::createUnTagged(char * w)
    {
    if(!*w)
        return;
#if QSORT
    unsorted[total] = u.unTaggedWords[total] = new unTaggedWord(w,unsorted + total);
#else
    if(!Root)
        {
        unTaggedWord * wrd = new unTaggedWord(w);
        unsorted[total] = wrd;
        Root = wrd;
        }
    else
        {
        unsorted[total] = Root->insert(w);
        }
#endif
    ++total;
    }

void taggedText::createTaggedAlternatives(char * w, const char * tag)
    {
    while(w && *w)
        {
        char * slash = strchr(w,'/');
        if(slash)
            {
            *slash = '\0';
            }
#if QSORT
        unsorted[total] = u.TaggedWords[total] = new taggedWord(w,tag,unsorted + total);
#else
        if(Root)
            {
            unsorted[total] = ((taggedWord*)Root)->insert(w,tag);
            }
        else
            {
            taggedWord * wrd = new taggedWord(w,tag);
            Root = wrd;
            unsorted[total] = wrd;
            }
#endif
        ++total;
        if(slash)
            {
            *slash = '/';
            w = slash + 1;
            }
        else
            w = NULL;
        }
    }

// Stores 'w' with 'tag' as the next tagged word and increments 'total'.
void taggedText::createTagged(char * w, const char * tag)
    {
#if QSORT
    taggedWord * const wrd = new taggedWord(w,tag,unsorted + total);
    u.TaggedWords[total] = wrd;
    unsorted[total] = wrd;
#else
    if(!Root)
        {
        taggedWord * wrd = new taggedWord(w,tag);
        Root = wrd;
        unsorted[total] = wrd;
        }
    else
        {
        unsorted[total] = ((taggedWord*)Root)->insert(w,tag);
        }
#endif
    ++total;
    }

// Reads the text in 'fpi' in two passes: the first pass counts the words so
// that the arrays can be allocated, the second pass (after a rewind) creates
// the word objects. 'fpi' is rewound again at the end.
//
// fpi:             input file; has to support rewind().
// InputHasTags:    create tagged words instead of untagged words.
// listLemmas:      stored for later use.
// Iformat:         input format string for translateFormat(), or NULL to
//                  use the built-in tokenisation of getword().
// keepPunctuation: passed on to getword().
// nice:            print progress messages to stdout.
// size:            reading stops once 'total' has reached this value.
// treatSlashAsAlternativeSeparator: store each "/"-separated alternative of
//                  a token as a word of its own.
//
// The program is terminated if 'Iformat' does not contain '$w', or if a
// word in tagged input read without 'Iformat' has no tag.
taggedText::taggedText(FILE * fpi,bool InputHasTags,int listLemmas,char * Iformat,int keepPunctuation,bool nice,
                             unsigned long int size,bool treatSlashAsAlternativeSeparator)
    :InputHasTags(InputHasTags),listLemmas(listLemmas)
    {
#if !QSORT
    reducedtotal = 0;
    Root = NULL;
#endif
    unsigned long lineno = 1;
    basefrmarrD = 0;
    basefrmarrL = 0;

    char * w;
    total = 0;
    // getwordI() only assigns 'tag' if the format contains '$t' or if the
    // end of the file is reached, so it must not start out uninitialised.
    char * tag = NULL;
    // translateFormat() requires both to be NULL on entry.
    field * wordfield = NULL;
    field * tagfield = NULL;
    field * format = NULL;
    int slashFound = 0;
    // First pass: count the words.
    if(nice)
        printf("counting words\n");
    if(Iformat)
        {
        format = translateFormat(Iformat,wordfield,tagfield);
        if(!wordfield)
            {
            printf("Input format %s must specify '$w'.\n",Iformat);
            exit(0);
            }
        lineno = 1;
        while(total < size && (w = getwordI(fpi,tag,format,wordfield,tagfield,lineno)) != NULL)
            {
            if(*w)
                {
                ++total;
                if(treatSlashAsAlternativeSeparator)
                    {
                    total += findSlashes(w);
                    }
                }
            }
        }
    else
        {
        lineno = 1;
        while(total < size && (w = getword(fpi,tag,InputHasTags,keepPunctuation,slashFound,lineno)) != NULL)
            {
            if(*w)
                {
                ++total;
                if(slashFound && treatSlashAsAlternativeSeparator)
                    total += slashFound;
                }
            }
        }
    if(nice)
        {
        printf("... %lu words counted\n",total);
        }
    rewind(fpi);
    if(nice)
        printf("allocating array of pointers to words\n");
    unsorted =  new const unTaggedWord * [total];
    if(nice)
        printf("...allocated array\n");
    // Second pass: create the words. 'total' is counted up again.
    if(!InputHasTags)
        {
#if QSORT
        if(nice)
            printf("allocating array for untagged words\n");
        u.unTaggedWords = new unTaggedWord * [total];
        if(nice)
            printf("...allocated array for untagged words\n");
#endif
        total = 0;
        if(nice)
            printf("reading words\n");
        if(format)
            {
            lineno = 1;
            while(total < size && (w = getwordI(fpi,tag,format,wordfield,tagfield,lineno)) != NULL)
                {
                if(treatSlashAsAlternativeSeparator && findSlashes(w))
                    createUnTaggedAlternatives(w);
                else
                    createUnTagged(w);
                }
            }
        else
            {
            lineno = 1;
            while(total < size && (w = getword(fpi,tag,false,keepPunctuation,slashFound,lineno)) != NULL)
                {
                if(slashFound && treatSlashAsAlternativeSeparator)
                    createUnTaggedAlternatives(w);
                else
                    createUnTagged(w);
                }
            }
        if(nice)
            printf("...read words\n");
        }
    else
        {
#if QSORT
        if(nice)
            printf("allocating array for tagged words\n");
        u.TaggedWords = new taggedWord * [total];
        if(nice)
            printf("...allocated array for tagged words\n");
#endif
        total = 0;
        if(nice)
            printf("reading words\n");
        if(format)
            {
            // NOTE(review): if the format has no '$t', 'tag' is NULL here -
            // confirm that taggedWord accepts a NULL tag.
            lineno = 1;
            while(total < size && (w = getwordI(fpi,tag,format,wordfield,tagfield,lineno)) != NULL)
                {
                if(treatSlashAsAlternativeSeparator && findSlashes(w))
                    createTaggedAlternatives(w,tag);
                else if(*w)
                    createTagged(w,tag);
                }
            }
        else
            {
            lineno = 1;
            while(total < size && (w = getword(fpi,tag,true,true,slashFound,lineno)) != NULL)
                {
                if(*w)
                    {
                    if(!tag)
                        {
                        if(total > 1 && lineno > 1)
                            printf("Tag missing in word #%lu (\"%s\") (line #%lu).\n",total,w,lineno);
                        else
                            printf("Tag missing in word #%lu (\"%s\") (line #%lu). (Is the input text tagged?)\n",total,w,lineno);
                        exit(0);
                        }
                    if(slashFound && treatSlashAsAlternativeSeparator)
                        createTaggedAlternatives(w,tag);
                    else
                        createTagged(w,tag);
                    }
                }
#if !QSORT
            // NOTE(review): reducedtotal is only updated in this branch
            // (tagged input without input format) - confirm that this is
            // intended.
            reducedtotal = unTaggedWord::reducedtotal;
#endif
            }
        if(nice)
            printf("...read words\n");
        }
    rewind(fpi);
    sorted = false;
#if QSORT
    reducedtotal = total;
#endif
    }

taggedText::~taggedText()
    {
    // Release the two base form pointer arrays first, then the word storage.
    delete [] basefrmarrD;
    delete [] basefrmarrL;
#if !QSORT
    // Without QSORT the words hang off a single root object.
    delete Root;
#else
    // With QSORT the words are reached through an array of pointers. Only the
    // array itself is released here, not the words it points to.
    // NOTE(review): for tagged input the constructor allocates this array
    // through u.TaggedWords; freeing it through u.unTaggedWords presumes both
    // names refer to the same storage - confirm against the declaration of u.
    delete [] u.unTaggedWords;
#endif
    // NOTE(review): the 'unsorted' array allocated in the constructor is not
    // released here - confirm whether it is freed somewhere else.
    }


#if QSORT
void taggedText::sort()
    {
    // Sort the word array with qsort, then collapse every run of words that
    // compare equal into its first member and pack the survivors at the front
    // of the array. Afterwards 'reducedtotal' holds the number of survivors
    // (it is left untouched if there are no words) and 'sorted' is set.
    int keep = 0;   // index of the word that represents the current run
    int slot = 0;   // index the representative must occupy after packing
    int scan;       // index of the word being compared with the representative
    if(!InputHasTags)
        {
        qsort( (void *)u.unTaggedWords, total, sizeof( unTaggedWord * ), compareUnTaggedWords);
        for(scan = 1;scan < total;++scan)
            {
            if(cmpUnTagged(u.unTaggedWords[keep],u.unTaggedWords[scan]))
                {
                // A different word: the finished run's representative moves
                // down to its final slot (if it is not already there) and the
                // word at 'scan' becomes the representative of a new run.
                if(slot != keep)
                    {
                    u.unTaggedWords[slot] = u.unTaggedWords[keep];
                    u.unTaggedWords[keep]->setUnsorted(u.unTaggedWords[slot]);
                    u.unTaggedWords[keep] = NULL;
                    }
                keep = scan;
                ++slot;
                }
            else
                {
                // Same word again: let the representative take over its count
                // (presumably what addCnt does - confirm), let the duplicate
                // hand over via setUnsorted, then destroy the duplicate.
                u.unTaggedWords[keep]->addCnt(u.unTaggedWords[scan]);
                u.unTaggedWords[scan]->setUnsorted(u.unTaggedWords[keep]);
                delete u.unTaggedWords[scan];
                u.unTaggedWords[scan] = NULL;
                }
            }
        }
    else
        {
        // Identical procedure for tagged words, using the tagged comparison
        // functions and the tagged view of the array.
        qsort( (void *)u.TaggedWords, total, sizeof( taggedWord * ), compareTaggedWords);
        for(scan = 1;scan < total;++scan)
            {
            if(cmpTagged(u.TaggedWords[keep],u.TaggedWords[scan]))
                {
                if(slot != keep)
                    {
                    u.TaggedWords[slot] = u.TaggedWords[keep];
                    u.TaggedWords[keep]->setUnsorted(u.TaggedWords[slot]);
                    u.TaggedWords[keep] = NULL;
                    }
                keep = scan;
                ++slot;
                }
            else
                {
                u.TaggedWords[keep]->addCnt(u.TaggedWords[scan]);
                u.TaggedWords[scan]->setUnsorted(u.TaggedWords[keep]);
                delete u.TaggedWords[scan];
                u.TaggedWords[scan] = NULL;
                }
            }
        }
    // The last run never meets a "different word", so its representative is
    // moved into place here. This step goes through u.unTaggedWords for both
    // kinds of input.
    // NOTE(review): for tagged input this relies on u.unTaggedWords and
    // u.TaggedWords sharing storage - confirm against the declaration of u.
    if(slot != keep)
        {
        u.unTaggedWords[slot] = u.unTaggedWords[keep];
        u.unTaggedWords[keep]->setUnsorted(u.unTaggedWords[slot]);
        u.unTaggedWords[keep] = NULL;
        }
    if(total > 0)
        reducedtotal = slot+1;
    sorted = true;
    }
#endif

bool taggedText::setFormat(const char * cformat,const char * bformat,const char * Bformat,bool InputHasTags)
    {
    // Pure forwarding: the three format strings and the tag flag are handed
    // unchanged to unTaggedWord::setFormat, whose result is returned as is.
    const bool accepted = unTaggedWord::setFormat(cformat,bformat,Bformat,InputHasTags);
    return accepted;
    }
