zoukankan      html  css  js  c++  java
  • English Morphology

    最近参与了一个小项目,需要编写一个针对英文单词的词干提取(stemming)算法。

    1. 最为常见的词干提取算法就是 The English (Porter2) stemming algorithm:http://snowball.tartarus.org/algorithms/english/stemmer.html

    // This file was generated automatically by the Snowball to Java compiler
    
    package org.tartarus.snowball.ext;
    
    import org.tartarus.snowball.Among;
    
     /**
      * This class was automatically generated by a Snowball to Java compiler 
      * It implements the stemming algorithm defined by a snowball script.
      */
    
    public class englishStemmer extends org.tartarus.snowball.SnowballStemmer {
    
    // Fixed serialVersionUID for this class.
    private static final long serialVersionUID = 1L;
    
            // Single shared instance passed as the last constructor argument of every
            // Among entry in the tables below.
            private final static englishStemmer methodObject = new englishStemmer ();
    
                    // Word beginnings looked up by r_mark_regions() via find_among(a_0, 3).
                    // When one of them matches, the cursor position after the match is used
                    // as p1 instead of running the vowel/non-vowel scan.
                    // NOTE(review): the Among constructor arguments appear to be
                    // (string, index of a related shorter entry or -1, result code,
                    // method name, method object) -- confirm against the Among class.
                    private final static Among a_0[] = {
                        new Among ( "arsen", -1, -1, "", methodObject ),
                        new Among ( "commun", -1, -1, "", methodObject ),
                        new Among ( "gener", -1, -1, "", methodObject )
                    };
    
                    // Apostrophe suffixes searched backwards by r_Step_1a() via
                    // find_among_b(a_1, 3). Every entry carries result code 1, which
                    // r_Step_1a() handles by deleting the matched slice.
                    private final static Among a_1[] = {
                        new Among ( "'", -1, 1, "", methodObject ),
                        new Among ( "'s'", 0, 1, "", methodObject ),
                        new Among ( "'s", -1, 1, "", methodObject )
                    };
    
                    // Suffixes searched backwards by r_Step_1a() via find_among_b(a_2, 6).
                    // Result codes as handled there:
                    //   1 ("sses")       -> slice replaced with "ss"
                    //   2 ("ied", "ies") -> slice replaced with "i" or "ie"
                    //   3 ("s")          -> slice deleted, subject to a backward vowel scan
                    //  -1 ("ss", "us")   -> no switch case matches, word left unchanged
                    private final static Among a_2[] = {
                        new Among ( "ied", -1, 2, "", methodObject ),
                        new Among ( "s", -1, 3, "", methodObject ),
                        new Among ( "ies", 1, 2, "", methodObject ),
                        new Among ( "sses", 1, 1, "", methodObject ),
                        new Among ( "ss", 1, -1, "", methodObject ),
                        new Among ( "us", 1, -1, "", methodObject )
                    };
    
                    // Endings tested by r_Step_1b() via find_among_b(a_3, 13) after an
                    // "ed"/"ing"-style suffix has been deleted. Result codes as handled there:
                    //   1 ("bl", "at", "iz") -> "e" is inserted at the cursor
                    //   2 (doubled letters)  -> the character before the cursor is deleted
                    //   3 (empty string)     -> "e" is inserted only when the cursor equals
                    //                           I_p1 and r_shortv() succeeds
                    private final static Among a_3[] = {
                        new Among ( "", -1, 3, "", methodObject ),
                        new Among ( "bb", 0, 2, "", methodObject ),
                        new Among ( "dd", 0, 2, "", methodObject ),
                        new Among ( "ff", 0, 2, "", methodObject ),
                        new Among ( "gg", 0, 2, "", methodObject ),
                        new Among ( "bl", 0, 1, "", methodObject ),
                        new Among ( "mm", 0, 2, "", methodObject ),
                        new Among ( "nn", 0, 2, "", methodObject ),
                        new Among ( "pp", 0, 2, "", methodObject ),
                        new Among ( "rr", 0, 2, "", methodObject ),
                        new Among ( "at", 0, 1, "", methodObject ),
                        new Among ( "tt", 0, 2, "", methodObject ),
                        new Among ( "iz", 0, 1, "", methodObject )
                    };
    
                    // Suffixes searched backwards by r_Step_1b() via find_among_b(a_4, 6).
                    // Result codes as handled there:
                    //   1 ("eed", "eedly") -> replaced with "ee" when r_R1() holds
                    //   2 (all others)     -> deleted when a vowel is found earlier in the
                    //                         word, then followed by the a_3 lookup
                    private final static Among a_4[] = {
                        new Among ( "ed", -1, 2, "", methodObject ),
                        new Among ( "eed", 0, 1, "", methodObject ),
                        new Among ( "ing", -1, 2, "", methodObject ),
                        new Among ( "edly", -1, 2, "", methodObject ),
                        new Among ( "eedly", 3, 1, "", methodObject ),
                        new Among ( "ingly", -1, 2, "", methodObject )
                    };
    
                    // Suffixes searched backwards by r_Step_2() via find_among_b(a_5, 24).
                    // r_Step_2() requires r_R1() to hold after a match and then switches on
                    // the result code (third constructor argument) to choose the rewrite.
                    private final static Among a_5[] = {
                        new Among ( "anci", -1, 3, "", methodObject ),
                        new Among ( "enci", -1, 2, "", methodObject ),
                        new Among ( "ogi", -1, 13, "", methodObject ),
                        new Among ( "li", -1, 16, "", methodObject ),
                        new Among ( "bli", 3, 12, "", methodObject ),
                        new Among ( "abli", 4, 4, "", methodObject ),
                        new Among ( "alli", 3, 8, "", methodObject ),
                        new Among ( "fulli", 3, 14, "", methodObject ),
                        new Among ( "lessli", 3, 15, "", methodObject ),
                        new Among ( "ousli", 3, 10, "", methodObject ),
                        new Among ( "entli", 3, 5, "", methodObject ),
                        new Among ( "aliti", -1, 8, "", methodObject ),
                        new Among ( "biliti", -1, 12, "", methodObject ),
                        new Among ( "iviti", -1, 11, "", methodObject ),
                        new Among ( "tional", -1, 1, "", methodObject ),
                        new Among ( "ational", 14, 7, "", methodObject ),
                        new Among ( "alism", -1, 8, "", methodObject ),
                        new Among ( "ation", -1, 7, "", methodObject ),
                        new Among ( "ization", 17, 6, "", methodObject ),
                        new Among ( "izer", -1, 6, "", methodObject ),
                        new Among ( "ator", -1, 7, "", methodObject ),
                        new Among ( "iveness", -1, 11, "", methodObject ),
                        new Among ( "fulness", -1, 9, "", methodObject ),
                        new Among ( "ousness", -1, 10, "", methodObject )
                    };
    
                    // Suffix table with result codes 1..6.
                    // NOTE(review): the method that searches this table is not shown in this
                    // part of the file; presumably a later stemming step -- confirm.
                    private final static Among a_6[] = {
                        new Among ( "icate", -1, 4, "", methodObject ),
                        new Among ( "ative", -1, 6, "", methodObject ),
                        new Among ( "alize", -1, 3, "", methodObject ),
                        new Among ( "iciti", -1, 4, "", methodObject ),
                        new Among ( "ical", -1, 4, "", methodObject ),
                        new Among ( "tional", -1, 1, "", methodObject ),
                        new Among ( "ational", 5, 2, "", methodObject ),
                        new Among ( "ful", -1, 5, "", methodObject ),
                        new Among ( "ness", -1, 5, "", methodObject )
                    };
    
                    // Suffix table; every entry has result code 1 except "ion", which has 2.
                    // NOTE(review): the method that searches this table is not shown in this
                    // part of the file; presumably a later stemming step -- confirm.
                    private final static Among a_7[] = {
                        new Among ( "ic", -1, 1, "", methodObject ),
                        new Among ( "ance", -1, 1, "", methodObject ),
                        new Among ( "ence", -1, 1, "", methodObject ),
                        new Among ( "able", -1, 1, "", methodObject ),
                        new Among ( "ible", -1, 1, "", methodObject ),
                        new Among ( "ate", -1, 1, "", methodObject ),
                        new Among ( "ive", -1, 1, "", methodObject ),
                        new Among ( "ize", -1, 1, "", methodObject ),
                        new Among ( "iti", -1, 1, "", methodObject ),
                        new Among ( "al", -1, 1, "", methodObject ),
                        new Among ( "ism", -1, 1, "", methodObject ),
                        new Among ( "ion", -1, 2, "", methodObject ),
                        new Among ( "er", -1, 1, "", methodObject ),
                        new Among ( "ous", -1, 1, "", methodObject ),
                        new Among ( "ant", -1, 1, "", methodObject ),
                        new Among ( "ent", -1, 1, "", methodObject ),
                        new Among ( "ment", 15, 1, "", methodObject ),
                        new Among ( "ement", 16, 1, "", methodObject )
                    };
    
                    // Two single-letter endings: "e" (result 1) and "l" (result 2).
                    // NOTE(review): the method that searches this table is not shown in this
                    // part of the file -- confirm where it is used.
                    private final static Among a_8[] = {
                        new Among ( "e", -1, 1, "", methodObject ),
                        new Among ( "l", -1, 2, "", methodObject )
                    };
    
                    // Whole-word list; every entry has result code -1.
                    // NOTE(review): the method that searches this table is not shown in this
                    // part of the file; presumably these are words exempted from further
                    // stemming -- confirm.
                    private final static Among a_9[] = {
                        new Among ( "succeed", -1, -1, "", methodObject ),
                        new Among ( "proceed", -1, -1, "", methodObject ),
                        new Among ( "exceed", -1, -1, "", methodObject ),
                        new Among ( "canning", -1, -1, "", methodObject ),
                        new Among ( "inning", -1, -1, "", methodObject ),
                        new Among ( "earring", -1, -1, "", methodObject ),
                        new Among ( "herring", -1, -1, "", methodObject ),
                        new Among ( "outing", -1, -1, "", methodObject )
                    };
    
                    // Whole-word list with result codes 1..11, or -1 for some entries.
                    // NOTE(review): the method that searches this table is not shown in this
                    // part of the file; presumably each positive code selects a fixed
                    // replacement for an irregular word -- confirm.
                    private final static Among a_10[] = {
                        new Among ( "andes", -1, -1, "", methodObject ),
                        new Among ( "atlas", -1, -1, "", methodObject ),
                        new Among ( "bias", -1, -1, "", methodObject ),
                        new Among ( "cosmos", -1, -1, "", methodObject ),
                        new Among ( "dying", -1, 3, "", methodObject ),
                        new Among ( "early", -1, 9, "", methodObject ),
                        new Among ( "gently", -1, 7, "", methodObject ),
                        new Among ( "howe", -1, -1, "", methodObject ),
                        new Among ( "idly", -1, 6, "", methodObject ),
                        new Among ( "lying", -1, 4, "", methodObject ),
                        new Among ( "news", -1, -1, "", methodObject ),
                        new Among ( "only", -1, 10, "", methodObject ),
                        new Among ( "singly", -1, 11, "", methodObject ),
                        new Among ( "skies", -1, 2, "", methodObject ),
                        new Among ( "skis", -1, 1, "", methodObject ),
                        new Among ( "sky", -1, -1, "", methodObject ),
                        new Among ( "tying", -1, 5, "", methodObject ),
                        new Among ( "ugly", -1, 8, "", methodObject )
                    };
    
                    // Character-group table passed to in_grouping/out_grouping (and their
                    // _b variants) together with the code-point bounds 97..121 ('a'..'y').
                    // NOTE(review): presumably a bitmap over that range marking the vowels
                    // a, e, i, o, u, y -- confirm against the grouping helpers.
                    private static final char g_v[] = {17, 65, 16, 1 };
    
                    // Character-group table passed to out_grouping_b in r_shortv() with the
                    // bounds 89..121 ('Y'..'y').
                    // NOTE(review): presumably the vowels plus w, x and Y -- confirm.
                    private static final char g_v_WXY[] = {1, 17, 65, 208, 1 };
    
                    // Character-group table; not referenced in the visible part of the file.
                    // NOTE(review): name suggests the letters that may validly precede "li"
                    // -- confirm at the point of use.
                    private static final char g_valid_LI[] = {55, 141, 2 };
    
            // Set by r_prelude(): true once at least one 'y' has been rewritten to 'Y'.
            private boolean B_Y_found;
            // Region marker written by r_mark_regions() and tested by r_R2().
            private int I_p2;
            // Region marker written by r_mark_regions() and tested by r_R1().
            private int I_p1;
    
                    /**
                     * Copies the stemmer-specific state (both region markers and the
                     * Y-found flag) from {@code other}, then delegates to the superclass
                     * to copy the remaining state.
                     *
                     * @param other the stemmer whose state is copied into this instance
                     */
                    private void copy_from(englishStemmer other) {
                        this.I_p1 = other.I_p1;
                        this.I_p2 = other.I_p2;
                        this.B_Y_found = other.B_Y_found;
                        super.copy_from(other);
                    }
    
                    /**
                     * Prelude pass over the word, working forwards from the cursor.
                     * <ol>
                     *   <li>Clears {@code B_Y_found}.</li>
                     *   <li>Deletes an apostrophe found at the cursor.</li>
                     *   <li>Replaces a {@code y} found at the cursor with {@code Y}.</li>
                     *   <li>Repeatedly finds a {@code y} that directly follows a character
                     *       of the {@code g_v} group and replaces it with {@code Y}.</li>
                     * </ol>
                     * {@code B_Y_found} is set whenever a replacement was made. The cursor
                     * is restored to its entry position before returning.
                     *
                     * @return always {@code true}
                     */
                    private boolean r_prelude() {
                        B_Y_found = false;

                        // Every sub-step starts from, and is rewound to, the entry position.
                        final int entry = cursor;

                        // Step 1: strip an apostrophe at the cursor.
                        bra = cursor;
                        if (eq_s(1, "'")) {
                            ket = cursor;
                            slice_del();
                        }
                        cursor = entry;

                        // Step 2: a 'y' at the cursor becomes 'Y'.
                        bra = cursor;
                        if (eq_s(1, "y")) {
                            ket = cursor;
                            slice_from("Y");
                            B_Y_found = true;
                        }
                        cursor = entry;

                        // Step 3: each 'y' preceded by a g_v character becomes 'Y'.
                        while (true) {
                            final int roundStart = cursor;
                            boolean matched = false;

                            // Advance one character at a time until the pattern matches
                            // at the current position or the limit is reached.
                            while (true) {
                                final int here = cursor;
                                if (in_grouping(g_v, 97, 121)) {
                                    bra = cursor;
                                    if (eq_s(1, "y")) {
                                        ket = cursor;
                                        matched = true;
                                    }
                                }
                                cursor = here;
                                if (matched || cursor >= limit) {
                                    break;
                                }
                                cursor++;
                            }

                            if (!matched) {
                                cursor = roundStart;
                                break;
                            }
                            slice_from("Y");
                            B_Y_found = true;
                        }

                        cursor = entry;
                        return true;
                    }
    
                    /**
                     * Computes the region markers {@code I_p1} and {@code I_p2}.
                     * <p>
                     * Both default to {@code limit}. {@code I_p1} is the position after a
                     * match from table {@code a_0}, or otherwise the position reached by
                     * scanning past one {@code g_v} character and then one non-{@code g_v}
                     * character. {@code I_p2} is reached by repeating that scan from
                     * {@code I_p1}. If a scan hits the limit, the markers not yet assigned
                     * keep their default. The cursor is restored before returning.
                     *
                     * @return always {@code true}
                     */
                    private boolean r_mark_regions() {
                        I_p1 = limit;
                        I_p2 = limit;

                        final int entry = cursor;
                        regions:
                        {
                            // The a_0 prefixes define p1 directly; otherwise scan for it.
                            if (find_among(a_0, 3) == 0) {
                                cursor = entry;
                                while (!in_grouping(g_v, 97, 121)) {
                                    if (cursor >= limit) {
                                        break regions;
                                    }
                                    cursor++;
                                }
                                while (!out_grouping(g_v, 97, 121)) {
                                    if (cursor >= limit) {
                                        break regions;
                                    }
                                    cursor++;
                                }
                            }
                            I_p1 = cursor;

                            // Same scan again, starting at p1, yields p2.
                            while (!in_grouping(g_v, 97, 121)) {
                                if (cursor >= limit) {
                                    break regions;
                                }
                                cursor++;
                            }
                            while (!out_grouping(g_v, 97, 121)) {
                                if (cursor >= limit) {
                                    break regions;
                                }
                                cursor++;
                            }
                            I_p2 = cursor;
                        }
                        cursor = entry;
                        return true;
                    }
    
                    /**
                     * Backward test with two alternatives, tried in order:
                     * <ol>
                     *   <li>a character outside {@code g_v_WXY}, preceded by a character
                     *       in {@code g_v}, preceded by a character outside {@code g_v};</li>
                     *   <li>a character outside {@code g_v}, preceded by a character in
                     *       {@code g_v}, with the cursor then at {@code limit_backward}.</li>
                     * </ol>
                     * The cursor is rewound before trying the second alternative. It is
                     * not restored on return; callers wrap the call in their own
                     * save/restore.
                     *
                     * @return {@code true} if either alternative matches
                     */
                    private boolean r_shortv() {
                        final int fromEnd = limit - cursor;

                        // Alternative 1; && keeps the original left-to-right evaluation.
                        if (out_grouping_b(g_v_WXY, 89, 121)
                                && in_grouping_b(g_v, 97, 121)
                                && out_grouping_b(g_v, 97, 121)) {
                            return true;
                        }

                        // Alternative 2, retried from the original position.
                        cursor = limit - fromEnd;
                        if (!out_grouping_b(g_v, 97, 121)) {
                            return false;
                        }
                        if (!in_grouping_b(g_v, 97, 121)) {
                            return false;
                        }
                        return cursor <= limit_backward;
                    }
    
                    /**
                     * Tests whether the cursor is at or after the {@code I_p1} marker.
                     *
                     * @return {@code true} if {@code I_p1 <= cursor}
                     */
                    private boolean r_R1() {
                        return I_p1 <= cursor;
                    }
    
                    /**
                     * Tests whether the cursor is at or after the {@code I_p2} marker.
                     *
                     * @return {@code true} if {@code I_p2 <= cursor}
                     */
                    private boolean r_R2() {
                        return I_p2 <= cursor;
                    }
    
                    /**
                     * Step 1a, working backwards from the cursor.
                     * <p>
                     * First optionally deletes a suffix from table {@code a_1}. Then
                     * looks up table {@code a_2} and acts on the result code:
                     * <ul>
                     *   <li>1: replace the slice with {@code "ss"};</li>
                     *   <li>2: replace with {@code "i"} if the cursor can hop back two
                     *       positions, otherwise with {@code "ie"};</li>
                     *   <li>3: step back one character, scan backwards past a
                     *       {@code g_v} character, then delete the slice;</li>
                     *   <li>any other non-zero code: leave the word unchanged.</li>
                     * </ul>
                     *
                     * @return {@code false} if no {@code a_2} entry matches or a code-3
                     *         scan reaches {@code limit_backward}; {@code true} otherwise
                     */
                    private boolean r_Step_1a() {
                        // Optional part: a missing a_1 match only rewinds the cursor.
                        final int fromEnd = limit - cursor;
                        ket = cursor;
                        int among_var = find_among_b(a_1, 3);
                        if (among_var == 0) {
                            cursor = limit - fromEnd;
                        } else {
                            bra = cursor;
                            if (among_var == 1) {
                                slice_del();
                            }
                        }

                        // Mandatory part: lookup in a_2.
                        ket = cursor;
                        among_var = find_among_b(a_2, 6);
                        if (among_var == 0) {
                            return false;
                        }
                        bra = cursor;

                        if (among_var == 1) {
                            slice_from("ss");
                        } else if (among_var == 2) {
                            final int hopped = cursor - 2;
                            if (limit_backward <= hopped && hopped <= limit) {
                                cursor = hopped;
                                slice_from("i");
                            } else {
                                slice_from("ie");
                            }
                        } else if (among_var == 3) {
                            if (cursor <= limit_backward) {
                                return false;
                            }
                            cursor--;
                            // A g_v character must exist somewhere before this point.
                            while (!in_grouping_b(g_v, 97, 121)) {
                                if (cursor <= limit_backward) {
                                    return false;
                                }
                                cursor--;
                            }
                            slice_del();
                        }
                        return true;
                    }
    
                    /**
                     * Step 1b, working backwards from the cursor.
                     * <p>
                     * Looks up table {@code a_4} and acts on the result code:
                     * <ul>
                     *   <li>1: if {@link #r_R1()} holds, replace the slice with
                     *       {@code "ee"};</li>
                     *   <li>2: require a {@code g_v} character earlier in the word, delete
                     *       the slice, then look up table {@code a_3} on what remains:
                     *       code 1 inserts {@code "e"}, code 2 deletes the character
                     *       before the cursor, code 3 inserts {@code "e"} only when the
                     *       cursor equals {@code I_p1} and {@link #r_shortv()}
                     *       succeeds.</li>
                     * </ul>
                     *
                     * @return {@code false} if any lookup or required test fails;
                     *         {@code true} otherwise
                     */
                    private boolean r_Step_1b() {
                        ket = cursor;
                        int among_var = find_among_b(a_4, 6);
                        if (among_var == 0) {
                            return false;
                        }
                        bra = cursor;

                        if (among_var == 1) {
                            if (!r_R1()) {
                                return false;
                            }
                            slice_from("ee");
                            return true;
                        }
                        if (among_var != 2) {
                            return true;
                        }

                        // Look-ahead only: the cursor is rewound after the vowel scan.
                        final int suffixFromEnd = limit - cursor;
                        while (!in_grouping_b(g_v, 97, 121)) {
                            if (cursor <= limit_backward) {
                                return false;
                            }
                            cursor--;
                        }
                        cursor = limit - suffixFromEnd;
                        slice_del();

                        // Inspect the new ending without consuming it.
                        final int stemFromEnd = limit - cursor;
                        among_var = find_among_b(a_3, 13);
                        if (among_var == 0) {
                            return false;
                        }
                        cursor = limit - stemFromEnd;

                        switch (among_var) {
                            case 1: {
                                final int keep = cursor;
                                insert(cursor, cursor, "e");
                                cursor = keep;
                                break;
                            }
                            case 2: {
                                ket = cursor;
                                if (cursor <= limit_backward) {
                                    return false;
                                }
                                cursor--;
                                bra = cursor;
                                slice_del();
                                break;
                            }
                            case 3: {
                                if (cursor != I_p1) {
                                    return false;
                                }
                                final int beforeTest = limit - cursor;
                                if (!r_shortv()) {
                                    return false;
                                }
                                cursor = limit - beforeTest;
                                final int keep = cursor;
                                insert(cursor, cursor, "e");
                                cursor = keep;
                                break;
                            }
                        }
                        return true;
                    }
    
                    /**
                     * Step 1c, working backwards from the cursor: replaces a final
                     * {@code y} or {@code Y} with {@code i} when the character before it
                     * is outside the {@code g_v} group and the cursor, after stepping over
                     * that character, is not at {@code limit_backward}.
                     *
                     * @return {@code true} if the replacement was made, else {@code false}
                     */
                    private boolean r_Step_1c() {
                        ket = cursor;

                        // Accept either "y" or "Y", retrying from the same position.
                        final int fromEnd = limit - cursor;
                        if (!eq_s_b(1, "y")) {
                            cursor = limit - fromEnd;
                            if (!eq_s_b(1, "Y")) {
                                return false;
                            }
                        }
                        bra = cursor;

                        if (!out_grouping_b(g_v, 97, 121)) {
                            return false;
                        }
                        // "not atlimit": fail if the cursor is at limit_backward.
                        if (cursor <= limit_backward) {
                            return false;
                        }
                        slice_from("i");
                        return true;
                    }
    
                    /**
                     * Step 2 (Snowball script lines 99-122): looks up a suffix in a_5, requires
                     * r_R1() to succeed, then rewrites or deletes the bracketed slice according
                     * to the code returned by find_among_b.
                     *
                     * Code 13 also requires a preceding "l". Code 16 requires the preceding
                     * character to be in g_valid_LI and deletes the slice instead of replacing it.
                     *
                     * @return false if no suffix matched or a required test failed, true otherwise
                     */
                    private boolean r_Step_2() {
                        ket = cursor;
                        final int suffixKind = find_among_b(a_5, 24);
                        if (suffixKind == 0) {
                            return false;
                        }
                        bra = cursor;

                        if (!r_R1()) {
                            return false;
                        }

                        // Pick the replacement text; the two guarded codes may bail out early.
                        final String replacement;
                        switch (suffixKind) {
                            case 1:
                                replacement = "tion";
                                break;
                            case 2:
                                replacement = "ence";
                                break;
                            case 3:
                                replacement = "ance";
                                break;
                            case 4:
                                replacement = "able";
                                break;
                            case 5:
                                replacement = "ent";
                                break;
                            case 6:
                                replacement = "ize";
                                break;
                            case 7:
                                replacement = "ate";
                                break;
                            case 8:
                                replacement = "al";
                                break;
                            case 9:
                            case 14:
                                replacement = "ful";
                                break;
                            case 10:
                                replacement = "ous";
                                break;
                            case 11:
                                replacement = "ive";
                                break;
                            case 12:
                                replacement = "ble";
                                break;
                            case 13:
                                // Script line 119: only applies when preceded by "l".
                                if (!eq_s_b(1, "l")) {
                                    return false;
                                }
                                replacement = "og";
                                break;
                            case 15:
                                replacement = "less";
                                break;
                            case 16:
                                // Script line 122: delete when the preceding character is in g_valid_LI.
                                if (!in_grouping_b(g_valid_LI, 99, 116)) {
                                    return false;
                                }
                                slice_del();
                                return true;
                            default:
                                // Any other code leaves the word untouched.
                                return true;
                        }

                        slice_from(replacement);
                        return true;
                    }
    
                    /**
                     * Step 3 (Snowball script lines 126-136): looks up a suffix in a_6, requires
                     * r_R1() to succeed, then replaces the bracketed slice (codes 1-4) or deletes
                     * it (codes 5 and 6; code 6 only when r_R2() also succeeds).
                     *
                     * @return false if no suffix matched or a required test failed, true otherwise
                     */
                    private boolean r_Step_3() {
                        ket = cursor;
                        final int suffixKind = find_among_b(a_6, 9);
                        if (suffixKind == 0) {
                            return false;
                        }
                        bra = cursor;

                        if (!r_R1()) {
                            return false;
                        }

                        // null means "delete the slice"; a non-null value is the replacement text.
                        String replacement = null;
                        switch (suffixKind) {
                            case 1:
                                replacement = "tion";
                                break;
                            case 2:
                                replacement = "ate";
                                break;
                            case 3:
                                replacement = "al";
                                break;
                            case 4:
                                replacement = "ic";
                                break;
                            case 5:
                                break;
                            case 6:
                                // Script line 136: deletion additionally requires R2.
                                if (!r_R2()) {
                                    return false;
                                }
                                break;
                            default:
                                // Any other code leaves the word untouched.
                                return true;
                        }

                        if (replacement == null) {
                            slice_del();
                        } else {
                            slice_from(replacement);
                        }
                        return true;
                    }
    
                    /**
                     * Step 4 (Snowball script lines 140-145): looks up a suffix in a_7, requires
                     * r_R2() to succeed, then deletes the bracketed slice. For code 2 the slice
                     * is deleted only when it is preceded by "s" or "t".
                     *
                     * @return false if no suffix matched or a required test failed, true otherwise
                     */
                    private boolean r_Step_4() {
                        ket = cursor;
                        final int suffixKind = find_among_b(a_7, 18);
                        if (suffixKind == 0) {
                            return false;
                        }
                        bra = cursor;

                        if (!r_R2()) {
                            return false;
                        }

                        if (suffixKind == 2) {
                            // "s" or "t": rewind before trying the second literal.
                            final int mark = limit - cursor;
                            if (!eq_s_b(1, "s")) {
                                cursor = limit - mark;
                                if (!eq_s_b(1, "t")) {
                                    return false;
                                }
                            }
                        }

                        if (suffixKind == 1 || suffixKind == 2) {
                            slice_del();
                        }
                        return true;
                    }
    
                    /**
                     * Step 5 (Snowball script lines 149-152): looks up a suffix in a_8 and
                     * deletes the bracketed slice when the conditions for its code hold.
                     *
                     * Code 1: r_R2() succeeds, or r_R1() succeeds and r_shortv() fails.
                     * Code 2: r_R2() succeeds and the slice is preceded by "l".
                     *
                     * @return false if no suffix matched or the conditions failed, true otherwise
                     */
                    private boolean r_Step_5() {
                        ket = cursor;
                        final int suffixKind = find_among_b(a_8, 2);
                        if (suffixKind == 0) {
                            return false;
                        }
                        bra = cursor;

                        if (suffixKind == 1) {
                            final int mark = limit - cursor;
                            if (!r_R2()) {
                                // Fallback branch of the "or": R1 and not shortv.
                                cursor = limit - mark;
                                if (!r_R1()) {
                                    return false;
                                }
                                final int beforeShortv = limit - cursor;
                                if (r_shortv()) {
                                    return false;
                                }
                                cursor = limit - beforeShortv;
                            }
                            slice_del();
                        } else if (suffixKind == 2) {
                            if (!r_R2()) {
                                return false;
                            }
                            if (!eq_s_b(1, "l")) {
                                return false;
                            }
                            slice_del();
                        }
                        return true;
                    }
    
                    /**
                     * Second exception check (Snowball script lines 156-158): succeeds when
                     * find_among_b matches an entry of a_9 and the cursor is then no longer
                     * above limit_backward (the script's "atlimit" test). Nothing is rewritten;
                     * only ket, bra and the cursor are touched.
                     *
                     * @return true if an entry matched and the cursor reached the backward limit
                     */
                    private boolean r_exception2() {
                        ket = cursor;
                        final int matched = find_among_b(a_9, 8);
                        if (matched == 0) {
                            return false;
                        }
                        bra = cursor;
                        return cursor <= limit_backward;
                    }
    
                    /**
                     * First exception check (Snowball script lines 168-187): matches forward
                     * against a_10 and requires the match to end at limit (the script's
                     * "atlimit" test). Codes 1-11 replace the bracketed slice with a fixed
                     * string; any other non-zero code succeeds without changing the word.
                     *
                     * @return false if nothing matched or the match stopped before limit,
                     *         true otherwise
                     */
                    private boolean r_exception1() {
                        bra = cursor;
                        final int wordKind = find_among(a_10, 18);
                        if (wordKind == 0) {
                            return false;
                        }
                        ket = cursor;

                        if (cursor < limit) {
                            return false;
                        }

                        final String replacement;
                        switch (wordKind) {
                            case 1:
                                replacement = "ski";
                                break;
                            case 2:
                                replacement = "sky";
                                break;
                            case 3:
                                replacement = "die";
                                break;
                            case 4:
                                replacement = "lie";
                                break;
                            case 5:
                                replacement = "tie";
                                break;
                            case 6:
                                replacement = "idl";
                                break;
                            case 7:
                                replacement = "gentl";
                                break;
                            case 8:
                                replacement = "ugli";
                                break;
                            case 9:
                                replacement = "earli";
                                break;
                            case 10:
                                replacement = "onli";
                                break;
                            case 11:
                                replacement = "singl";
                                break;
                            default:
                                // Matched, but this code carries no rewrite.
                                return true;
                        }

                        slice_from(replacement);
                        return true;
                    }
    
                    /**
                     * Postlude (Snowball script line 203): when B_Y_found is set, repeatedly
                     * scans forward from the cursor for "Y" and replaces each occurrence
                     * with "y".
                     *
                     * Each pass brackets the match with bra/ket and moves the cursor back to
                     * the start of the match before calling slice_from. When a pass reaches
                     * limit without a match, the cursor is restored to where that pass began.
                     *
                     * @return false if B_Y_found is not set, true otherwise
                     */
                    private boolean r_postlude() {
                        if (!B_Y_found) {
                            return false;
                        }

                        // "repeat": one iteration per "Y" found.
                        while (true) {
                            final int passStart = cursor;
                            boolean found = false;

                            // "goto": advance one character at a time until "Y" matches or limit is hit.
                            while (true) {
                                final int here = cursor;
                                bra = cursor;
                                if (eq_s(1, "Y")) {
                                    ket = cursor;
                                    cursor = here;
                                    found = true;
                                    break;
                                }
                                cursor = here;
                                if (cursor >= limit) {
                                    break;
                                }
                                cursor++;
                            }

                            if (!found) {
                                cursor = passStart;
                                return true;
                            }
                            slice_from("y");
                        }
                    }
    
                    /**
                     * Entry point of the stemmer (Snowball script lines 205-227).
                     *
                     * Order of work:
                     * 1. If r_exception1() succeeds, stop.
                     * 2. If the cursor cannot hop 3 positions without passing limit, stop.
                     * 3. Run r_prelude() and r_mark_regions(), restoring the cursor after each.
                     * 4. Switch to backward mode and run r_Step_1a().
                     * 5. Unless r_exception2() succeeds, run r_Step_1b(), r_Step_1c() and
                     *    r_Step_2() through r_Step_5() in order.
                     * 6. Switch back to forward mode and run r_postlude().
                     *
                     * The boolean results of the individual steps are ignored. In backward mode
                     * the cursor is saved as an offset from limit and restored from the current
                     * limit after each step.
                     *
                     * @return always true
                     */
                    public boolean stem() {
                        final int wordStart = cursor;

                        if (r_exception1()) {
                            return true;
                        }
                        cursor = wordStart;

                        // "not hop 3": nothing more to do when 3 positions ahead is out of range.
                        final int hopTarget = cursor + 3;
                        if (0 > hopTarget || hopTarget > limit) {
                            cursor = wordStart;
                            return true;
                        }
                        cursor = wordStart;

                        int saved = cursor;
                        r_prelude();
                        cursor = saved;

                        saved = cursor;
                        r_mark_regions();
                        cursor = saved;

                        // "backwards": work from the end of the word towards the current position.
                        limit_backward = cursor;
                        cursor = limit;

                        saved = limit - cursor;
                        r_Step_1a();
                        cursor = limit - saved;

                        saved = limit - cursor;
                        if (!r_exception2()) {
                            cursor = limit - saved;

                            saved = limit - cursor;
                            r_Step_1b();
                            cursor = limit - saved;

                            saved = limit - cursor;
                            r_Step_1c();
                            cursor = limit - saved;

                            saved = limit - cursor;
                            r_Step_2();
                            cursor = limit - saved;

                            saved = limit - cursor;
                            r_Step_3();
                            cursor = limit - saved;

                            saved = limit - cursor;
                            r_Step_4();
                            cursor = limit - saved;

                            saved = limit - cursor;
                            r_Step_5();
                            cursor = limit - saved;
                        }

                        // Leave backward mode.
                        cursor = limit_backward;

                        saved = cursor;
                        r_postlude();
                        cursor = saved;

                        return true;
                    }
    
            /**
             * Treats every englishStemmer instance as equal to every other.
             *
             * @param o the object to compare with
             * @return true if {@code o} is a non-null englishStemmer (or subclass) instance
             */
            public boolean equals( Object o ) {
                final boolean sameKind = englishStemmer.class.isInstance(o);
                return sameKind;
            }
    
            /**
             * Returns a hash derived from the class name only, so it is the same for
             * every englishStemmer instance.
             *
             * @return the hash code of the englishStemmer class name
             */
            public int hashCode() {
                final String className = englishStemmer.class.getName();
                return className.hashCode();
            }
    
    
    
    }
    porter2 stemming algorithm

     然而,Porter stemming 仅仅是一个基于后缀的词干提取技术,它仅仅定义了一些基本的后缀规则,能识别出"books"->"book"等。然而针对一些诸如 "bought"->"buy","brought"->"bring"等异常形式并不能识别出来。

    2. The dragon toolkit (http://dragon.ischool.drexel.edu/download.asp)

    然后发现了上面这个 NLP 处理工具,其中的 EngLemmatiser 类就是用于词干提取(stem)的类,能提取出单词的词干。

    它首先定义一些基本的后缀规则(只有十几条),然后定义一些独立于这些规则的异常词库(master slave 的形式),这样就能基本实现单词词干的正确提取,解决了porter stemming 存在的问题。

    // Usage sketch for EngLemmatiser: build a lemmatiser, lemmatise one word, print the result.
    // The string below is passed to the constructor as its first argument.
    String dictionaryPath = "lemmatiser";
            // NOTE(review): the meaning of the two boolean arguments is not visible here - confirm against the toolkit's documentation.
            EngLemmatiser lemmatiser =  new EngLemmatiser(dictionaryPath, false, true);
    
            // Feed an irregular past-tense form and print whatever lemmatize() returns for it.
            String a = "brought";
            String lemmatizedWord = lemmatiser.lemmatize(a);
            System.out.println(lemmatizedWord);
    View Code

    然而我还是觉得,在规则基础之上附加词典的技术过于死板,不够灵活。

    3. Stanford CoreNLP

    后来发现斯坦福大学的一个NLP工具,其中提取词干的技术:针对大量语料库进行机器学习,利用有限自动机提炼并生成规则(不必附加词典)。能完美解决词干的提取问题,准确率很高。它对地名、人名等专有词识别不出来,但达到了基本的需求。

        // Usage sketch for Morphology: create an instance, call stem() on one word, print the result.
        String word="magnificus";
            Morphology morph=new Morphology();
            System.out.println(morph.stem(word));
    View Code
  • 相关阅读:
    解决ASP.NET MVC AllowAnonymous属性无效导致无法匿名访问控制器的问题
    ASP.NET MVC 4 RC的JS/CSS打包压缩功能 (转载)
    oracle报错ORA-01507
    oracle 大表删除数据后,回收空间的问题。
    解决MySQL服务启动时报1067错误
    尚未在 Web 服务器上注册 ASP.NET 4.0” 的解决办法
    thymeleaf中相对路径的两种方式
    史上最详 Thymeleaf 使用教程
    isNotBlank()和isNotEmpty()总结
    IDEA去除掉虚线,波浪线,和下划线实线的方法
  • 原文地址:https://www.cnblogs.com/kennethshu/p/3833998.html
Copyright © 2011-2022 走看看