package es.udc.lbd.galicianstemmer;

/**
 * Stemmer for Galician. Removes suffixes and tries to extract the root of the term.
 *
 * <p>The word being processed is held in the instance field {@code word}; the private helpers
 * read it and replace it with new arrays as suffixes are stripped. Every call to
 * {@code stem} overwrites that field.
 *
 * <p>NOTE(review): the visible text of this file is truncated in two places (marked below).
 * The bodies of {@code unification()}, {@code adverb()}, {@code augmentative()},
 * {@code noun()} and {@code verb()}, which {@code stem(char[])} calls, and the header of
 * {@code removeAccents()} are not visible here.
 */
public class GalicianStemmer {
    // Presumably the number of rules in each suffix table -- TODO confirm: the loops that
    // would use these values are not visible in this source. PLURAL equals the 20 entries of
    // pluralSuffix in plural(), and VOW equals the 13 entries of vowel in thematicVowel().
    private static int ADV = 1;
    private static int AUG = 46;
    private static int UNI = 27;
    private static int NOUN = 61;
    private static int VERB = 169;
    private static int PLURAL = 20;
    private static int VOW = 13;

    // The word currently being stemmed; replaced by a new array whenever a suffix is stripped.
    private char[] word;
    // Length of the word as measured once in stem(char[]), right after lower-casing.
    // NOTE(review): not updated by stripSuffix/append in the visible code.
    private int wlength;

    public GalicianStemmer() {}

    /**
     * Stems the word contained in the String and returns the root in another String.
     *
     * @param palabra The term to stem.
     * @return The root of the word.
     * @throws NullPointerException if {@code palabra} is null ({@code toCharArray()} is called on it).
     */
    public String stem(String palabra) {
        return new String(this.stem(palabra.toCharArray()));
    }

    /**
     * Stems the word contained in the char array and returns the root in a char array.
     *
     * <p>For a non-null, non-empty input the steps run in this order: lower-casing, plural
     * reduction (only when the word ends in 's' and has at least 3 characters), unification,
     * adverb, augmentative (repeated until it returns false), noun, verb (only when noun
     * returned false), thematic vowel, accent removal.
     *
     * <p>NOTE(review): the input array is stored in {@code word} and lower-cased in place, so
     * the caller's array is modified. A null or empty input is returned as is.
     *
     * @param palabra The term to stem.
     * @return The root of the word.
     */
    public char[] stem(char[] palabra) {
        this.word = palabra;
        // NOTE(review): this local is never used in the method.
        int i = 0;
        if ((word != null) && (word.length > 0)) {
            toLowerCase();
            wlength = word.length;
            if (ends(word, new char[] {'s'}) && (wlength >= 3)) plural();
            unification();
            adverb();
            // Empty loop body: keeps calling augmentative() for as long as it returns true.
            while (augmentative());
            if (!noun()) {
                verb();
            }
            thematicVowel();
            removeAccents();
        }
        return word;
    }

    /**
     * Lower-cases {@code word} in place: ASCII 'A'..'Z' are shifted by {@code 'a' - 'A'};
     * other characters go through the switch on accented capitals.
     *
     * <p>NOTE(review): none of the switch cases ends with {@code break}, so a matched case
     * falls through every assignment after it and the character always ends up as 'ú'
     * (e.g. 'Á' becomes 'ú', not 'á').
     */
    private void toLowerCase() {
        for (int i=0 ; i < word.length; i++) {
            if ((word[i] >= 'A') && (word[i] <= 'Z')) {
                word[i] += 'a' - 'A';
            } else switch (word[i]) {
                case 'Ã': word[i] = 'ã';
                case 'Á': word[i] = 'á';
                case 'À': word[i] = 'à';
                case 'É': word[i] = 'é';
                case 'Ê': word[i] = 'ê';
                case 'Í': word[i] = 'í';
                case 'Ó': word[i] = 'ó';
                case 'Õ': word[i] = 'õ';
                case 'Ô': word[i] = 'ô';
                case 'Ú': word[i] = 'ú';
            }
        }
    }

    /**
     * Compares {@code w} and {@code s} character by character starting from their last
     * positions and returns false on the first mismatch.
     *
     * <p>NOTE(review): the loop stops as soon as either array is exhausted and then returns
     * true, so the result is also true when {@code w} is shorter than {@code s} but equals the
     * tail of {@code s}, and when {@code s} is empty.
     *
     * @param w the word to inspect.
     * @param s the candidate suffix.
     * @return false if a compared position differs, true otherwise.
     */
    private boolean ends(char w[], char s[]) {
        int i = s.length;
        int j = w.length;
        while (i > 0 && j > 0) {
            if (w[j-1] != s[i-1]) {
                // The assignment has no effect: the method returns on the next statement.
                i=0;
                return false;
            } else {
                i--;
                j--;
            }
        }
        return true;
    }

    /**
     * Applies one suffix rule to {@code word}. The rule fires when the part left after
     * removing the suffix has at least {@code stem.minStemSize} characters, {@code word} ends
     * with {@code stem.suffix}, and {@code exceptions(stem.exceptions)} is false. When it
     * fires, {@code word} is replaced by the remaining prefix followed by
     * {@code stem.replacement}.
     *
     * @param stem the rule to try.
     * @return true if the rule fired and {@code word} was replaced, false otherwise.
     */
    private boolean stripSuffix(StemStruct stem) {
        int wlen = word.length;
        int slen = stem.suffix.length;
        // Allocated before the checks; Math.max avoids a negative size when the suffix is
        // longer than the word.
        char[] nuevo = new char[Math.max(0, wlen - slen)];
        if ((wlen - slen >=
                stem.minStemSize) && (ends(word, stem.suffix)) && !(exceptions(stem.exceptions))) {
            for (int i = 0; i < wlen - slen; i++) nuevo[i] = word[i];
            word = nuevo;
            append(stem.replacement);
            return true;
        } else return false;
    }

    /**
     * Same as {@code stripSuffix}, except that the exception list is checked with
     * {@code exceptions2}, which compares each entry with the whole word instead of with
     * its ending.
     *
     * @param stem the rule to try.
     * @return true if the rule fired and {@code word} was replaced, false otherwise.
     */
    private boolean stripSuffix2(StemStruct stem) {
        int wlen = word.length;
        int slen = stem.suffix.length;
        char[] nuevo = new char[Math.max(0,wlen - slen)];
        if ((wlen-slen >= stem.minStemSize) && (ends(word,stem.suffix)) && !(exceptions2(stem.exceptions))) {
            for (int i = 0; i < wlen - slen; i++) nuevo[i] = word[i];
            word = nuevo;
            append(stem.replacement);
            return true;
        } else return false;
    }

    /**
     * Tells whether {@code ends(word, entry)} is true for some entry of {@code ex}.
     * Scanning stops at the end of the array or at the first empty string in it.
     *
     * @param ex the exception list of a rule.
     * @return true if an entry matched, false otherwise.
     */
    private boolean exceptions(String[] ex) {
        int i=0;
        // NOTE(review): done is set only immediately before a return, so the (!done) test in
        // the loop condition is always true.
        boolean done = false;
        while ((i < ex.length) && (!done) && (ex[i].length()) > 0) {
            if ((ends(word, ex[i].toCharArray()))) {
                done = true;
                return true;
            } else i++;
            // Leaves before the loop condition would be evaluated again with i == ex.length.
            if (i == ex.length) return false;
        }
        return false;
    }

    /**
     * Tells whether some entry of {@code ex} is equal to the whole current word.
     * Scanning stops at the end of the array or at the first empty string in it.
     *
     * @param ex the exception list of a rule.
     * @return true if an entry equals the word, false otherwise.
     */
    private boolean exceptions2(String[] ex) {
        int i=0;
        boolean done = false;
        while ((i < ex.length) && (!done) && (ex[i].length()) > 0) {
            if (ex[i].equals(new String(word))) {
                done = true;
                return true;
            } else i++;
            if (i == ex.length) return false;
        }
        return false;
    }

    /**
     * Replaces {@code word} with a new array holding the current word followed by {@code r}.
     *
     * @param r the characters to add at the end of the word.
     */
    private void append(char r[]) {
        int wlen = word.length;
        int rlen = r.length;
        char[] nueva = new char[wlen + rlen];
        for (int i=0; i < wlen; i++) nueva[i] = word[i];
        for (int j=0; j < rlen; j++) nueva[wlen + j] = r[j];
        word = nueva;
    }

    /**
     * Plural-reduction step; stem(char[]) calls it only for words of at least 3 characters
     * that end in 's'. Each table entry is (suffix, minimum stem size, replacement,
     * exceptions).
     *
     * <p>NOTE(review): the loop that applies the table is not visible (see below), so how
     * the entries are tried cannot be confirmed from this source.
     */
    private void plural() {
        // NOTE(review): "Menfis" and "Kinguís" contain capitals, while stem(char[]) lower-cases
        // the word before this step -- check whether these entries can ever match as intended.
        StemStruct[] pluralSuffix = { new StemStruct("ns",1,"n",new String[] {"luns", "furatapóns", "furatapons"}),
            new StemStruct("ós",3,"ón", new String[0]),
            new StemStruct("ões",3,"ón",new String[0]),
            new StemStruct("ães",1,"ão",(new String[] {"mães", "magalhães"})),
            new StemStruct("ais",2,"al",new String[] {"cais","tais", "mais", "pais", "ademais"}),
            new StemStruct("áis",2,"al",new String[] {"cáis","táis", "máis", "páis", "ademáis"}),
            new StemStruct("éis",2,"el",new String[0]),
            new StemStruct("eis",2,"el",new String[0]),
            new StemStruct("óis",2,"ol",new String[] {"escornabóis"}),
            new
            StemStruct("ois",2,"ol",new String[] {"escornabois"}),
            new StemStruct("ís",2,"il",new String[] {"país"}),
            new StemStruct("is",2,"il",new String[] {"Menfis", "pais", "Kinguís"}),
            new StemStruct("les",2,"l",new String[] {"ingles", "marselles", "montreales", "senegales", "manizales", "móstoles", "nápoles"}),
            new StemStruct("res",3,"r",new String[] {"petres", "henares", "cáceres", "baleares", "linares", "londres", "mieres", "miraflores", "mércores", "venres", "pires"}),
            new StemStruct("ces",2,"z", new String[0]),
            new StemStruct("zes",2,"z", new String[0]),
            new StemStruct("ises",3,"z", new String[0]),
            new StemStruct("ás",1,"al", new String[] {"más"}),
            new StemStruct("ses",2,"s", new String[0]),
            new StemStruct("s",2,"",new String[] {"barbadés", "barcelonés", "cantonés", "gabonés", "llanés", "medinés",
                "escocés", "escocês", "francês", "barcelonês", "cantonês", "macramés", "reves", "barcelones",
                "cantones", "gabones", "llanes", "magallanes", "medines", "escoces", "frances", "xoves", "martes",
                "aliás","pires","lápis","cais","mais", "mas","menos", "férias","pêsames","crúcis", "país", "cangas",
                "atenas", "asturias", "canarias", "filipinas", "honduras", "molucas", "caldas", "mascareñas",
                "micenas", "covarrubias", "psoas", "óculos", "nupcias", "xoves", "martes", "llanes"})};
        int i=0;
        boolean done = false;
        // NOTE(review): the source text is truncated on the next line. The loop condition of
        // plural() runs straight into the tail of a later method that returns boolean and
        // refers to VERB; everything in between (including the methods unification(),
        // adverb(), augmentative(), noun() and verb() called from stem(char[])) is missing
        // and must be restored from an intact copy of the file.
        while ((i= VERB-2) return false;
        }
        return false;
    }

    /**
     * Thematic-vowel step, called by stem(char[]) after the noun/verb steps. Each table entry
     * is (suffix, minimum stem size, replacement, exceptions).
     *
     * <p>NOTE(review): the loop that applies the table is not visible (see below).
     */
    void thematicVowel() {
        // NOTE(review): "Amanhã" contains a capital although the word is lower-cased first.
        StemStruct vowel[] = { new StemStruct("gue",2,"g", new String[] {"azougue", "dengue", "merengue", "nurague", "merengue", "rengue"}),
            new StemStruct("que",2, "c", new String[] {"alambique", "albaricoque", "abaroque", "alcrique", "almadraque",
                "almanaque", "arenque", "arinque", "baduloque", "ballestrinque", "betoque", "bivaque", "bloque",
                "bodaque", "bosque", "breque", "buque", "cacique", "cheque", "claque", "contradique", "coque",
                "croque", "dique", "duque", "enroque", "espeque", "estoque", "estoraque", "estraloque", "estrinque",
                "milicroque", "monicreque", "orinque", "arinque", "palenque", "parque",
                "penique", "picabeque", "pique", "psique", "raque", "remolque", "xeque", "repenique", "roque",
                "sotobosque", "tabique", "tanque", "toque", "traque", "truque", "vivaque", "xaque"}),
            new StemStruct("a",3,"", new String[] {"amasadela", "cerva"}),
            new StemStruct("e",3,"", new String[] {"marte"}),
            new StemStruct("o",3,"", new String[] {"barro", "fado", "cabo", "libro", "cervo"}),
            new StemStruct("â",3,"", new String[0]),
            new StemStruct("ã",3,"", new String[] {"Amanhã", "arapuã", "fã", "divã", "manhã"}),
            new StemStruct("ê",3,"", new String[0]),
            new StemStruct("ô",3,"", new String[0]),
            new StemStruct("á",3,"", new String[0]),
            new StemStruct("é",3,"", new String[0]),
            new StemStruct("ó",3,"", new String[0]),
            new StemStruct("i",3,"", new String[0]), };
        int i=0;
        boolean done=false ;
        // NOTE(review): the source text is truncated on the next line. The loop condition of
        // thematicVowel() runs straight into a loop that walks word downwards by index and
        // replaces á, ó, é, ê, í and ú with the unaccented letter -- presumably the body of
        // removeAccents(), whose header is not visible. Restore from an intact copy.
        while ((i= 0) {
            if (word[i] == 'á') word[i] = 'a';
            else if (word[i] == 'ó') word[i] = 'o';
            else if (word[i] == 'é') word[i] = 'e';
            else if (word[i] == 'ê') word[i] = 'e';
            else if (word[i] == 'í') word[i] = 'i';
            else if (word[i] == 'ú') word[i] = 'u';
            i--;
        }
    }
}

/**
 * One suffix rule: the suffix to look for, the minimum number of characters that must remain
 * once it is removed, the replacement to add in its place, and the list of exceptions for
 * which the rule must not fire.
 */
class StemStruct {
    char[] suffix;
    int minStemSize;
    char[] replacement;
    String[] exceptions;

    /**
     * Builds a rule; the suffix and the replacement are stored as char arrays.
     *
     * @param suffix the suffix the rule looks for.
     * @param minStemSize the minimum length of what is left after removing the suffix.
     * @param replacement the text added after the suffix is removed (may be empty).
     * @param exceptions the exception entries; the array is stored as given, not copied.
     */
    public StemStruct(String suffix, int minStemSize, String replacement, String[] exceptions) {
        this.suffix = suffix.toCharArray();
        this.minStemSize = minStemSize;
        this.replacement = replacement.toCharArray();
        this.exceptions = exceptions;
    }
}