/*
CSTLEMMA - trainable lemmatiser

Copyright (C) 2002, 2005, 2009  Center for Sprogteknologi, University of Copenhagen

This file is part of CSTLEMMA.

CSTLEMMA is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

CSTLEMMA is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with CSTLEMMA; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
#include "letterfunc.h"
// NOTE(review): the header name of this #include was missing in the reviewed
// copy of the file; <stddef.h> is an assumption - confirm against the repository.
#include <stddef.h>

#if UNICODE_CAPABLE
#include "letter.h"

// When true, 'I' and 'i' are NOT treated as each other's case equivalents by
// lowerEquivalent()/upperEquivalent() (Turkic dotted/dotless i handling);
// they are then converted through the regular case tables instead.
bool Turcic = false;

#if 0
bool isAlpha(int kar)
{
    /*
    int ind = kar % ARRSIZE;
    if((int)Letter[ind].Unfolded == kar)
        return true;*/
    /*
    From http://www.w3.org/TR/xml11/#NT-NameStartChar:
    NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6]
                    | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
                    | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
                    | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD]
                    | [#x10000-#xEFFFF]
    */
    // 20090702
    return 'A' <= kar && kar <= 'Z'
        || 'a' <= kar && kar <= 'z'
        || 0xC0 <= kar && kar <= 0xD6
        || 0xD8 <= kar && kar <= 0xF6
        || 0xF8 <= kar && kar <= 0x2FF
        || 0x370 <= kar && kar <= 0x37D
        || 0x37F <= kar && kar <= 0x1FFF
        || 0x200C <= kar && kar <= 0x200D
        || 0x2070 <= kar && kar <= 0x218F
        || 0x2C00 <= kar && kar <= 0x2FEF
        || 0x3001 <= kar && kar <= 0xD7FF
        || 0xF900 <= kar && kar <= 0xFDCF
        || 0xFDF0 <= kar && kar <= 0xFFFD
        || 0x10000 <= kar && kar <= 0xEFFFF;
    //return kar > 255; // Assume Unicode positions outside Latin-1 is all alphabetic
}
#else
// Based on UnicodeData.txt 17-Aug-2009

// One run of alphabetic code points: L, L+1, ..., L+range.
// Both members are plain ints. As bit-fields (20 and 12 bits) they could not
// hold the 0x7FFFFFFF sentinel that terminates the table, nor the length of
// the large CJK/Hangul ranges.
struct cletter
{
    int L;      // first code point of the run
    int range;  // number of code points following L that belong to the run
};

// Sorted by L, non-overlapping, terminated by a sentinel whose L is larger
// than any code point isAlpha() will look up.
// The ranges that UnicodeData.txt lists as "<..., First>" / "<..., Last>"
// pairs (CJK Extension A, CJK Unified Ideographs, Hangul Syllables,
// CJK Extension B, CJK Extension C) are stored as full ranges, so every code
// point between First and Last counts as a letter.
struct cletter Cletters[] =
{
    {0x41,25},{0x61,25},
    {0xAA,0},{0xB5,0},{0xBA,0},{0xC0,22},{0xD8,30},{0xF8,457},{0x2C6,11},{0x2E0,4},
    {0x2EC,0},{0x2EE,0},{0x370,4},{0x376,1},{0x37A,3},{0x386,0},{0x388,2},{0x38C,0},
    {0x38E,19},{0x3A3,82},{0x3F7,138},{0x48A,155},{0x531,37},{0x559,0},{0x561,38},{0x5D0,26},
    {0x5F0,2},{0x621,41},{0x66E,1},{0x671,98},{0x6D5,0},{0x6E5,1},{0x6EE,1},{0x6FA,2},
    {0x6FF,0},{0x710,0},{0x712,29},{0x74D,88},{0x7B1,0},{0x7CA,32},{0x7F4,1},{0x7FA,0},
    {0x800,21},{0x81A,0},{0x824,0},{0x828,0},{0x904,53},{0x93D,0},{0x950,0},{0x958,9},
    {0x971,1},{0x979,6},{0x985,7},{0x98F,1},{0x993,21},{0x9AA,6},{0x9B2,0},{0x9B6,3},
    {0x9BD,0},{0x9CE,0},{0x9DC,1},{0x9DF,2},{0x9F0,1},{0xA05,5},{0xA0F,1},{0xA13,21},
    {0xA2A,6},{0xA32,1},{0xA35,1},{0xA38,1},{0xA59,3},{0xA5E,0},{0xA72,2},{0xA85,8},
    {0xA8F,2},{0xA93,21},{0xAAA,6},{0xAB2,1},{0xAB5,4},{0xABD,0},{0xAD0,0},{0xAE0,1},
    {0xB05,7},{0xB0F,1},{0xB13,21},{0xB2A,6},{0xB32,1},{0xB35,4},{0xB3D,0},{0xB5C,1},
    {0xB5F,2},{0xB71,0},{0xB83,0},{0xB85,5},{0xB8E,2},{0xB92,3},{0xB99,1},{0xB9C,0},
    {0xB9E,1},{0xBA3,1},{0xBA8,2},{0xBAE,11},{0xBD0,0},{0xC05,7},{0xC0E,2},{0xC12,22},
    {0xC2A,9},{0xC35,4},{0xC3D,0},{0xC58,1},{0xC60,1},{0xC85,7},{0xC8E,2},{0xC92,22},
    {0xCAA,9},{0xCB5,4},{0xCBD,0},{0xCDE,0},{0xCE0,1},{0xD05,7},{0xD0E,2},{0xD12,22},
    {0xD2A,15},{0xD3D,0},{0xD60,1},{0xD7A,5},{0xD85,17},{0xD9A,23},{0xDB3,8},{0xDBD,0},
    {0xDC0,6},{0xE01,47},{0xE32,1},{0xE40,6},{0xE81,1},{0xE84,0},{0xE87,1},{0xE8A,0},
    {0xE8D,0},{0xE94,3},{0xE99,6},{0xEA1,2},{0xEA5,0},{0xEA7,0},{0xEAA,1},{0xEAD,3},
    {0xEB2,1},{0xEBD,0},{0xEC0,4},{0xEC6,0},{0xEDC,1},{0xF00,0},{0xF40,7},{0xF49,35},
    {0xF88,3},{0x1000,42},{0x103F,0},{0x1050,5},{0x105A,3},{0x1061,0},{0x1065,1},{0x106E,2},
    {0x1075,12},{0x108E,0},{0x10A0,37},{0x10D0,42},{0x10FC,0},{0x1100,328},{0x124A,3},{0x1250,6},
    {0x1258,0},{0x125A,3},{0x1260,40},{0x128A,3},{0x1290,32},{0x12B2,3},{0x12B8,6},{0x12C0,0},
    {0x12C2,3},{0x12C8,14},{0x12D8,56},{0x1312,3},{0x1318,66},{0x1380,15},{0x13A0,84},{0x1401,619},
    {0x166F,16},{0x1681,25},{0x16A0,74},{0x1700,12},{0x170E,3},{0x1720,17},{0x1740,17},{0x1760,12},
    {0x176E,2},{0x1780,51},{0x17D7,0},{0x17DC,0},{0x1820,87},{0x1880,40},{0x18AA,0},{0x18B0,69},
    {0x1900,28},{0x1950,29},{0x1970,4},{0x1980,43},{0x19C1,6},{0x1A00,22},{0x1A20,52},{0x1AA7,0},
    {0x1B05,46},{0x1B45,6},{0x1B83,29},{0x1BAE,1},{0x1C00,35},{0x1C4D,2},{0x1C5A,35},{0x1CE9,3},
    {0x1CEE,3},{0x1D00,191},{0x1E00,277},{0x1F18,5},{0x1F20,37},{0x1F48,5},{0x1F50,7},{0x1F59,0},
    {0x1F5B,0},{0x1F5D,0},{0x1F5F,30},{0x1F80,52},{0x1FB6,6},{0x1FBE,0},{0x1FC2,2},{0x1FC6,6},
    {0x1FD0,3},{0x1FD6,5},{0x1FE0,12},{0x1FF2,2},{0x1FF6,6},{0x2071,0},{0x207F,0},{0x2090,4},
    {0x2102,0},{0x2107,0},{0x210A,9},{0x2115,0},{0x2119,4},{0x2124,0},{0x2126,0},{0x2128,0},
    {0x212A,3},{0x212F,10},{0x213C,3},{0x2145,4},{0x214E,0},{0x2183,1},{0x2C00,46},{0x2C30,46},
    {0x2C60,132},{0x2CEB,3},{0x2D00,37},{0x2D30,53},{0x2D6F,0},{0x2D80,22},{0x2DA0,6},{0x2DA8,6},
    {0x2DB0,6},{0x2DB8,6},{0x2DC0,6},{0x2DC8,6},{0x2DD0,6},{0x2DD8,6},{0x2E2F,0},{0x3005,1},
    {0x3031,4},{0x303B,1},{0x3041,85},{0x309D,2},{0x30A1,89},{0x30FC,3},{0x3105,40},{0x3131,93},
    {0x31A0,23},{0x31F0,15},
    {0x3400,0x4DB5 - 0x3400},   // CJK Ideograph Extension A, First..Last
    {0x4E00,0x9FCB - 0x4E00},   // CJK Ideograph, First..Last
    {0xA000,1164},{0xA4D0,45},
    {0xA500,268},{0xA610,15},{0xA62A,1},{0xA640,31},{0xA662,12},{0xA67F,24},{0xA6A0,69},{0xA717,8},
    {0xA722,102},{0xA78B,1},{0xA7FB,6},{0xA803,2},{0xA807,3},{0xA80C,22},{0xA840,51},{0xA882,49},
    {0xA8F2,5},{0xA8FB,0},{0xA90A,27},{0xA930,22},{0xA960,28},{0xA984,46},{0xA9CF,0},{0xAA00,40},
    {0xAA40,2},{0xAA44,7},{0xAA60,22},{0xAA7A,0},{0xAA80,47},{0xAAB1,0},{0xAAB5,1},{0xAAB9,4},
    {0xAAC0,0},{0xAAC2,0},{0xAADB,2},{0xABC0,34},
    {0xAC00,0xD7A3 - 0xAC00},   // Hangul Syllable, First..Last
    {0xD7B0,22},{0xD7CB,48},
    {0xF900,301},{0xFA30,61},{0xFA70,105},{0xFB00,6},{0xFB13,4},{0xFB1D,0},{0xFB1F,9},{0xFB2A,12},
    {0xFB38,4},{0xFB3E,0},{0xFB40,1},{0xFB43,1},{0xFB46,107},{0xFBD3,362},{0xFD50,63},{0xFD92,53},
    {0xFDF0,11},{0xFE70,4},{0xFE76,134},{0xFF21,25},{0xFF41,25},{0xFF66,88},{0xFFC2,5},{0xFFCA,5},
    {0xFFD2,5},{0xFFDA,2},{0x10000,11},{0x1000D,25},{0x10028,18},{0x1003C,1},{0x1003F,14},{0x10050,13},
    {0x10080,122},{0x10280,28},{0x102A0,48},{0x10300,30},{0x10330,16},{0x10342,7},{0x10380,29},{0x103A0,35},
    {0x103C8,7},{0x10400,157},{0x10800,5},{0x10808,0},{0x1080A,43},{0x10837,1},{0x1083C,0},{0x1083F,22},
    {0x10900,21},{0x10920,25},{0x10A00,0},{0x10A10,3},{0x10A15,2},{0x10A19,26},{0x10A60,28},{0x10B00,53},
    {0x10B40,21},{0x10B60,18},{0x10C00,72},{0x11083,44},{0x12000,878},{0x13000,1070},{0x1D400,84},{0x1D456,70},
    {0x1D49E,1},{0x1D4A2,0},{0x1D4A5,1},{0x1D4A9,3},{0x1D4AE,11},{0x1D4BB,0},{0x1D4BD,6},{0x1D4C5,64},
    {0x1D507,3},{0x1D50D,7},{0x1D516,6},{0x1D51E,27},{0x1D53B,3},{0x1D540,4},{0x1D546,0},{0x1D54A,6},
    {0x1D552,339},{0x1D6A8,24},{0x1D6C2,24},{0x1D6DC,30},{0x1D6FC,24},{0x1D716,30},{0x1D736,24},{0x1D750,30},
    {0x1D770,24},{0x1D78A,30},{0x1D7AA,24},{0x1D7C4,7},
    {0x20000,0x2A6D6 - 0x20000},    // CJK Ideograph Extension B, First..Last
    {0x2A700,0x2B734 - 0x2A700},    // CJK Ideograph Extension C, First..Last
    {0x2F800,541},
    {0x7FFFFFFF,0}                  // sentinel, must stay last
};

/*
Tells whether code point a lies in one of the runs listed in Cletters.

a       Unicode code point.
returns true if a is inside a run of Cletters, false otherwise. Values
        <= 0 and values above 0x10FFFF are never alphabetic.

The index of the run found by the previous call is remembered in a function
static, so that consecutive look-ups of nearby code points need only a short
scan. This cache is shared, unsynchronised state.
*/
bool isAlpha(int a)
{
    static int start_i = 0;

    if(a <= 0 || a > 0x10FFFF)
        return false;   // also guarantees that the scan stops at the sentinel

    // Continue after the cached run if a is not below it, else restart.
    int i = (a < Cletters[start_i].L) ? 0 : start_i + 1;
    while(a >= Cletters[i].L)
        ++i;

    // i is now the first run starting beyond a; the candidate is run i-1.
    if(i == 0)
        return false;   // a precedes the first run: there is no candidate

    start_i = i - 1;
    return a <= Cletters[start_i].L + Cletters[start_i].range;
}
#endif

/*
Maps code point a through case-conversion table T.

a       Unicode code point.
T       conversion table (l2u or u2l). The scan below has no bound of its own,
        so T is presumably sorted by L and closed by an entry whose L exceeds
        0x10FFFF - confirm in letter.h.
returns a + dif of the entry that covers a, or a itself if no entry applies.

An entry covers a if L <= a <= L+range and, when inc is 2 or more, only if a
sits at an even offset from L.
*/
static int convertLetter(int a,struct ccaseconv * T)
{
    int i;
    // Negative values would turn into huge unsigned numbers in the
    // comparisons below and could send the scan past the end of the table.
    if(a < 0 || a > 0x10FFFF)
        return a;
    for(i = 0;;++i)
    {
        if((unsigned int)a < T[i].L)
        {
            if(i == 0)
                return a;   // a lies before the first entry
            --i;            // the only entry that can cover a
            if  (  (unsigned int)a <= T[i].L + T[i].range
                && (  T[i].inc < 2
                   || !((a - T[i].L) & 1)
                   )
                )
            {
                return a + T[i].dif;
            }
            else
            {
                break;
            }
        }
    }
    return a;
}

/*
returns true if converting kar to upper case leaves it unchanged. Note that
this also holds for code points without any case mapping.
*/
bool isUpper(int kar)
{
#if 0
    int ind = kar % ARRSIZE;
    return ((int)Letter[ind].Unfolded == kar) && Letter[ind].Unfolded == Letter[ind].Capital;
#else
    return kar == convertLetter(kar,l2u);
#endif
}

/*
returns true if converting kar to lower case leaves it unchanged. Note that
this also holds for code points without any case mapping.
*/
bool isLower(int kar)
{
#if 0
    int ind = kar % ARRSIZE;
    return ((int)Letter[ind].Unfolded == kar) && Letter[ind].Unfolded == Letter[ind].Simple;// && Letter[ind].Capital != 0;
#else
    return kar == convertLetter(kar,u2l);
#endif
}

/*
static int toUpperUnicode(int a)
{
    return convertLetter(a,l2u);
}

static int toLowerUnicode(int a)
{
    return convertLetter(a,u2l);
}
*/

/*
returns the lower case counterpart of kar, or kar itself if the u2l table has
none. 'I' maps to 'i' unless Turcic is set.
*/
unsigned int lowerEquivalent(int kar)
{
    if(kar == 'I' && !Turcic)
        return 'i';
    return convertLetter(kar,u2l);
    /*
    int ind = kar % ARRSIZE;
    return ((int)Letter[ind].Unfolded == kar) && (Letter[ind].Simple) ? Letter[ind].Simple : kar;
    */
}

/*
returns the upper case counterpart of kar, or kar itself if the l2u table has
none. 'i' maps to 'I' unless Turcic is set.
*/
unsigned int upperEquivalent(int kar)
{
    if(kar == 'i' && !Turcic)
        return 'I';
    // Must use the lower-to-upper table; u2l would lower-case instead.
    return convertLetter(kar,l2u);
    /*
    int ind = kar % ARRSIZE;
    return ((int)Letter[ind].Unfolded == kar) && (Letter[ind].Capital) ? Letter[ind].Capital : kar;
    */
}

/*
Delivers the next case-folded code unit of the underlying string.

A character with a full folding is delivered piecewise: the first element at
once, the remaining elements on the following calls, until index 2 has been
passed or a zero element is met. A character with only a simple folding is
replaced by that folding. A character not found in the Letter table is
returned as is.
*/
int folded::C()
{
    if(inFull)
    {
        // Still emitting the full folding of an earlier character.
        if(i > 2 || w[i] == 0)
            inFull = false;
        else
            return w[i++];
    }
    int kar = c();
    if(kar < 0)
        return kar;     // never index the table with a negative value
    int ind = kar % ARRSIZE;
    if((int)Letter[ind].Unfolded == kar)
    {
        tri * Tri = Letter[ind].Full;
        if(  Tri
          && (w = Tri->w)[1] /* If the second element is zero, we have a
                                Turkic form, not a fully folded form. */
          )
        {
            inFull = true;
            i = 1;
            return w[0];
        }
        return Letter[ind].Simple;
    }
    return kar;
}

// Folding reader over a zero-terminated string of single-byte characters.
class charfolded:public folded // ISO
{
    private:
        const char * s;
    protected:
        virtual int c()
        {
            // Go through unsigned char: where plain char is signed, bytes
            // 0x80..0xFF would otherwise come out as negative code points.
            return static_cast<unsigned char>(*s++);
        }
    public:
        charfolded(const char * s):s(s)
        {
        }
        virtual ~charfolded()
        {
        }
};

// Folding reader over a zero-terminated wide character string.
class wcharfolded:public folded // UTF16
{
    private:
        const wchar_t * s;
    protected:
        virtual int c()
        {
            return *s++;
        }
    public:
        wcharfolded(const wchar_t * s):s(s)
        {
        }
        virtual ~wcharfolded()
        {
        }
};

/*
Compares a wide string with a narrow string, ignoring case.

s       zero-terminated wide character string.
p       zero-terminated single-byte character string.
returns 0 if the folded strings are equal, 1 if p ends first, -1 if s ends
        first, otherwise the difference between the first two folded values
        that differ.
*/
int strCaseCmp(const wchar_t *s, const char *p)
{
    wcharfolded S(s);
    charfolded P(p);
    int iS,iP;
    for(;;)
    {
        iS = S.C();
        iP = P.C();
        if(iS == 0)
            return iP ? -1 : 0;     // s exhausted
        if(iP == 0)
            return 1;               // only p exhausted
        if(iS != iP)
            return iS - iP;
    }
}
#endif