zoukankan      html  css  js  c++  java
  • 如何为Postgresql数据库全文搜索(full text search)编写解析器(parser)

    用英文写的,主要是命令和代码,就不翻译了,偷懒一下。

    This article show you how to create a parser to handle camel case [3] in string in Postgresql full text search. This parser is tested on Postgresql 8.3.9 version. The OS is ubuntu 8.10.

    [1,2] will give you basic background and sample for old version. I update the code to fit 8.3.9 version, especial lots of changes in the SQL statement.

    Create parser

    First, create a directory for put all files, for example: /home/user/Desktop/test_parser

    Create file "test_parser.c". I use vim.

    #include "postgres.h"

    #include "utils/builtins.h"





    #ifdef PG_MODULE_MAGIC

    PG_MODULE_MAGIC;

    #endif



    /*

    * types

    */



    /* self-defined type */

    typedef struct {

    char * buffer; /* text to parse */

    int len; /* length of the text in buffer */

    int pos; /* position of the parser */

    } ParserState;



    /* copy-paste from wparser.h of tsearch2 */

    typedef struct {

    int lexid;

    char *alias;

    char *descr;

    } LexDescr;



    /*

    * prototypes

    */

    PG_FUNCTION_INFO_V1(testprs_start);

    Datum testprs_start(PG_FUNCTION_ARGS);



    PG_FUNCTION_INFO_V1(testprs_getlexeme);

    Datum testprs_getlexeme(PG_FUNCTION_ARGS);



    PG_FUNCTION_INFO_V1(testprs_end);

    Datum testprs_end(PG_FUNCTION_ARGS);



    PG_FUNCTION_INFO_V1(testprs_lextype);

    Datum testprs_lextype(PG_FUNCTION_ARGS);



    /*

    * functions

    */

    Datum testprs_start
    (PG_FUNCTION_ARGS)

    {

    ParserState *pst = (ParserState *) palloc(sizeof(ParserState));

    pst->buffer = (char *) PG_GETARG_POINTER(0);

    pst->len = PG_GETARG_INT32(1);

    pst->pos = 0;

    PG_RETURN_POINTER(pst);

    }



    Datum testprs_getlexeme
    (PG_FUNCTION_ARGS)

    {

    ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);

    char **t = (char **) PG_GETARG_POINTER(1);

    int *tlen = (int *) PG_GETARG_POINTER(2);

    int type;



    *tlen = pst->pos;

    *t = pst->buffer + pst->pos;



    HTML clipboard/* main process here */
      if (((pst->buffer)[pst->pos] >= 'A' && (pst->buffer)[pst->pos] <= 'Z') && (pst->pos < pst->len)) {
     /* word type */
     type = 3;
     (pst->pos)++;
     /* for case like: ItemURL => Item URL*/
     if(((pst->buffer)[pst->pos] >= 'A' && (pst->buffer)[pst->pos] <= 'Z') && (pst->pos < pst->len)){
     while (((pst->buffer)[pst->pos] >= 'A' && (pst->buffer)[pst->pos] <= 'Z') && (pst->pos < pst->len)) {
     (pst->pos)++;
     }
     }
     
     /* go to the next upper case character */
     while (((pst->buffer)[pst->pos] >= 'a' && (pst->buffer)[pst->pos] <= 'z') && (pst->pos < pst->len)) {
     (pst->pos)++;
     }
     }
     
     if (((pst->buffer)[pst->pos] >= 'a' && (pst->buffer)[pst->pos] <= 'z') && (pst->pos < pst->len)) {
     /* word type */
     type = 3;
     (pst->pos)++;
     /* go to the next upper case character */
     while (((pst->buffer)[pst->pos] >= 'a' && (pst->buffer)[pst->pos] <= 'z') && (pst->pos < pst->len)) {
     (pst->pos)++;
     }
     }



    *tlen = pst->pos - *tlen;



    /* we are finished if (*tlen == 0) */

    if (*tlen == 0) type=0;

    PG_RETURN_INT32(type);

    }



    Datum testprs_end
    (PG_FUNCTION_ARGS)

    {

    ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);

    pfree(pst);

    PG_RETURN_VOID();

    }



    Datum testprs_lextype
    (PG_FUNCTION_ARGS)

    {

    /*

    Remarks:

    - we have to return the blanks for headline reason

    - we use the same lexids like Teodor in the default

    word parser; in this way we can reuse the headline

    function of the default word parser.

    */

    LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (2+1));



    /* there are only two types in this parser */

    descr[0].lexid = 3;

    descr[0].alias = pstrdup("word");

    descr[0].descr = pstrdup("Word");

    descr[1].lexid = 12;

    descr[1].alias = pstrdup("blank");

    descr[1].descr = pstrdup("Space symbols");

    descr[2].lexid = 0;

    PG_RETURN_POINTER(descr);

    }

    Create "Makefile" file under the same directory.

    override CPPFLAGS := -I. $(CPPFLAGS)
    MODULE_big = test_parser
    OBJS = test_parser.o

    DATA_built = test_parser.sql
    DATA =
    REGRESS = test_parser

    ifdef USE_PGXS
    PGXS := $(shell pg_config --pgxs)
    include $(PGXS)
    else
    subdir = contrib/test_parser
    top_builddir = ../..
    include $(top_builddir)/src/Makefile.global
    include $(top_srcdir)/contrib/contrib-global.mk
    endif

    Create "test_parser.sql" file under the same directory.

    SET search_path = public;
    BEGIN;

    --DROP TEXT SEARCH CONFIGURATION testcfg;
    --DROP TEXT SEARCH PARSER testparser;

    CREATE OR REPLACE FUNCTION testprs_start(internal,int4)
    RETURNS internal
    AS '$libdir/test_parser'
    LANGUAGE 'C';

    CREATE OR REPLACE FUNCTION testprs_getlexeme(internal,internal,internal)
    RETURNS internal
    AS '$libdir/test_parser'
    LANGUAGE 'C';

    CREATE OR REPLACE FUNCTION testprs_end(internal)
    RETURNS void
    AS '$libdir/test_parser'
    LANGUAGE 'C';

    CREATE OR REPLACE FUNCTION testprs_lextype(internal)
    RETURNS internal
    AS '$libdir/test_parser'
    LANGUAGE 'C';

    CREATE TEXT SEARCH PARSER testparser(
    START = 'testprs_start',
    GETTOKEN = 'testprs_getlexeme',
    END = 'testprs_end',
    LEXTYPES = 'testprs_lextype'
    );

    CREATE TEXT SEARCH CONFIGURATION testcfg (
    PARSER ='testparser'
    );

    ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH english_stem;


    END;

    Note: the bold part is total changed for the 8.3.9 version. "DROP TEXT SEARCH " is very useful if you make modification and load the script again.

    Install the parser:

    • open a terminal, switch to the directory you just create. For example: cd /home/user/Desktop/test_parser
    • type: sudo PATH=/usr/local/pgsql/bin:$PATH USE_PGXS=1 make install
    • switch to user "prostgres", type: /usr/local/pgsql/bin/psql -f /usr/local/pgsql/share/contrib/test_parser.sql your-database
      SET
      BEGIN
      CREATE FUNCTION
      CREATE FUNCTION
      CREATE FUNCTION
      CREATE FUNCTION
      DROP TEXT SEARCH CONFIGURATION
      DROP TEXT SEARCH PARSER
      CREATE TEXT SEARCH PARSER
      CREATE TEXT SEARCH CONFIGURATION
      ALTER TEXT SEARCH CONFIGURATION
      COMMIT
    • type:  /usr/local/pgsql/bin/psql postgres
    • test the parser:
      postgres=# SELECT to_tsvector('testcfg','itemValue');
      to_tsvector
      --------------------
      'item':1 'value':2
      (1 row)

      postgres=# SELECT to_tsvector('testcfg','ItemValue');
      to_tsvector
      --------------------
      'item':1 'value':2
      (1 row)

      postgres=# SELECT to_tsquery('testcfg','itemsValue');
      to_tsquery
      -------------------
      'items' & 'value'
      (1 row)

    Reference:
    [1] http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/docs/HOWTO-parser-tsearch2.html
    [2] http://www.pgcon.org/2007/schedule/attachments/12-fts.pdf
    [3] http://en.wikipedia.org/wiki/CamelCase
    [4] http://www.sai.msu.su/~megera/postgres/fts/doc/sql-fts-createmap.html
    [5] http://www.postgresql.org/docs/8.3/static/textsearch-configuration.html
    [6] http://developer.postgresql.org/pgdocs/postgres/sql-createtsconfig.html
    [7] http://www.postgresql.org/docs/7.4/interactive/sql-createfunction.html

  • 相关阅读:
    TextView-setCompondDrawables用法
    android 相对布局RelativeLayout中的一些属性的使用和实例
    登录时旋转等待效果
    使用slidingmeu_actionbarsherlock_lib的问题和The hierarchy of the type MainActivity is inconsistent
    ActionBarSherlock SlidingMenu整合,解决SlidingMenu example的getSupportActionBar()方法不能用问题
    String,StringBuffer和StringBuilder的区别
    File.separator使用
    Android常用异步任务执行方法
    adb server is out of date. killing...
    adb shell root
  • 原文地址:https://www.cnblogs.com/ainima/p/6331346.html
Copyright © 2011-2022 走看看