zoukankan      html  css  js  c++  java
  • sphinx的配置

    #
    # Sphinx configuration file sample
    #
    # WARNING! While this sample file mentions all available options,
    # it contains (very) short helper descriptions only. Please refer to
    # doc/sphinx.html for details.
    #

    #############################################################################
    ## data source definition
    #############################################################################
    # Main (full) data source for the tour_admin table.
    # Its pre-queries store the current MAX(id) of tour_admin in sph_counter,
    # and the main query then fetches only rows up to that stored id.
    source src2
    {
    type = mysql
    # MySQL connection parameters
    sql_host = localhost
    sql_user = abc
    sql_pass = def
    sql_db = tour_soft
    sql_port = 3306 # optional, default is 3306
    # pre-queries, executed before the main fetch query
    sql_query_pre = SET NAMES utf8
    sql_query_pre = SET SESSION query_cache_type=OFF
    # writes the row ( 1, MAX(id) of tour_admin ) into sph_counter;
    # presumably the columns are ( counter_id, max_doc_id ), as read back below - verify against the table schema
    sql_query_pre = REPLACE INTO sph_counter SELECT 1,MAX(id) FROM tour_admin
    # main fetch query: id first, then the constant 2 exposed as source_id
    # NOTE(review): the password column is selected here alongside username - confirm that indexing it is intended
    sql_query = select id,2 as source_id,username,password from tour_admin where id<=( SELECT max_doc_id FROM sph_counter WHERE counter_id=1 )
    # source_id is declared as an unsigned integer attribute
    sql_attr_uint = source_id
    sql_ranged_throttle = 0
    # single-document lookup by $id, same column list as sql_query
    sql_query_info = select id,2 as source_id,username,password from tour_admin where id=$id
    }
    # Incremental (delta) source, inherited from src2.
    # Only the settings listed below are redeclared; everything else comes from src2.
    source src2throttled : src2
    {
    sql_ranged_throttle = 100
    # NOTE(review): per the Sphinx delta-index documentation, redeclaring sql_query_pre in a
    # child source drops the inherited pre-queries, so the REPLACE INTO sph_counter
    # pre-query of src2 should not run here - confirm against the Sphinx version in use
    sql_query_pre = SET NAMES utf8
    # fetches only rows whose id is above the max_doc_id stored in sph_counter (counter_id=1)
    sql_query = select id,2 as source_id,username,password from tour_admin WHERE id>( SELECT max_doc_id FROM sph_counter WHERE counter_id=1 )
    }
    # Index built from source src2
    index test2
    {
    source = src2
    path = /usr/local/coreseek/var/data/test2 # this path must not be the same as the one used by test1
    docinfo = extern
    mlock = 0
    morphology = none
    min_word_len = 1
    charset_type = zh_cn.utf-8 # encoding
    charset_dictpath=/usr/local/mmseg3/etc # path of the Chinese language package
    html_strip = 0
    }
    # Incremental (delta) index, inherited from test2.
    # Overrides the source, the path and the morphology; all other settings come from test2.
    index src2throttled : test2
    {
    source = src2throttled # source of the incremental data
    path = /usr/local/coreseek/var/data/src2throttled
    # NOTE(review): test2 sets morphology = none, while this index uses stem_en -
    # confirm that a different morphology for the incremental index is intended
    morphology = stem_en
    }


    source src1
    {
    # data source type. mandatory, no default value
    # known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc
    type = mysql

    #####################################################################
    ## SQL settings (for 'mysql' and 'pgsql' types)
    #####################################################################

    # some straightforward parameters for SQL source types
    sql_host = localhost
    sql_user = abc
    sql_pass = def
    sql_db = tour_soft
    sql_port = 3306 # optional, default is 3306

    # UNIX socket name
    # optional, default is empty (reuse client library defaults)
    # usually '/var/lib/mysql/mysql.sock' on Linux
    # usually '/tmp/mysql.sock' on FreeBSD
    #
    # sql_sock = /tmp/mysql.sock


    # MySQL specific client connection flags
    # optional, default is 0
    #
    # mysql_connect_flags = 32 # enable compression

    # MySQL specific SSL certificate settings
    # optional, defaults are empty
    #
    # mysql_ssl_cert = /etc/ssl/client-cert.pem
    # mysql_ssl_key = /etc/ssl/client-key.pem
    # mysql_ssl_ca = /etc/ssl/cacert.pem

    # MS SQL specific Windows authentication mode flag
    # optional, default is 0
    #
    # mssql_winauth = 1 # use currently logged on user credentials


    # MS SQL specific Unicode indexing flag
    # MUST be in sync with charset_type index-level setting
    # optional, default is 0 (request SBCS data)
    #
    # mssql_unicode = 1 # request Unicode data from server


    # ODBC specific DSN (data source name)
    # mandatory for odbc source type, no default value
    #
    # odbc_dsn = DBQ=C:\data;DefaultDir=C:\data;Driver={Microsoft Text Driver (*.txt; *.csv)};
    # sql_query = SELECT id, data FROM documents.csv


    # pre-query, executed before the main fetch query
    # multi-value, optional, default is empty list of queries
    #
    sql_query_pre = SET NAMES utf8
    sql_query_pre = SET SESSION query_cache_type=OFF


    # main document fetch query
    # mandatory, integer document ID field MUST be the first selected column
    #sql_query = SELECT a.id,a.ext_company_name,b.content,b.realname,b.mob FROM tour_customer a,tour_group_user b WHERE a.adduid=b.id
    sql_query = SELECT a.id,1 as source_id,a.ext_company_name,b.content,b.realname,b.mob FROM tour_customer a,tour_group_user b WHERE a.adduid=b.id

    # range query setup, query that must return min and max ID values
    # optional, default is empty
    #
    # sql_query will need to reference $start and $end boundaries
    # if using ranged query:
    #
    # sql_query =
    # SELECT doc.id, doc.id AS group, doc.title, doc.data
    # FROM documents doc
    # WHERE id>=$start AND id<=$end
    #
    # sql_query_range = SELECT MIN(id),MAX(id) FROM documents


    # range query step
    # optional, default is 1024
    #
    # sql_range_step = 1000


    # unsigned integer attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # optional bit size can be specified, default is 32
    #
    # sql_attr_uint = author_id
    # sql_attr_uint = forum_id:9 # 9 bits for forum_id
    sql_attr_uint = source_id

    # boolean attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # equivalent to sql_attr_uint with 1-bit size
    #
    # sql_attr_bool = is_deleted


    # bigint attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # declares a signed (unlike uint!) 64-bit attribute
    #
    # sql_attr_bigint = my_bigint_id


    # UNIX timestamp attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # similar to integer, but can also be used in date functions
    #
    # sql_attr_timestamp = posted_ts
    # sql_attr_timestamp = last_edited_ts
    #sql_attr_timestamp = date_added

    # string ordinal attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # sorts strings (bytewise), and stores their indexes in the sorted list
    # sorting by this attr is equivalent to sorting by the original strings
    #
    # sql_attr_str2ordinal = author_name


    # floating point attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # values are stored in single precision, 32-bit IEEE 754 format
    #
    # sql_attr_float = lat_radians
    # sql_attr_float = long_radians


    # multi-valued attribute (MVA) attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # MVA values are variable length lists of unsigned 32-bit integers
    #
    # syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY]
    # ATTR-TYPE is 'uint' or 'timestamp'
    # SOURCE-TYPE is 'field', 'query', or 'ranged-query'
    # QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs
    # RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range'
    #
    # sql_attr_multi = uint tag from query; SELECT id, tag FROM tags
    # sql_attr_multi = uint tag from ranged-query;
    # SELECT id, tag FROM tags WHERE id>=$start AND id<=$end;
    # SELECT MIN(id), MAX(id) FROM tags


    # post-query, executed on sql_query completion
    # optional, default is empty
    #
    # sql_query_post =


    # post-index-query, executed on successful indexing completion
    # optional, default is empty
    # $maxid expands to max document ID actually fetched from DB
    #
    # sql_query_post_index = REPLACE INTO counters ( id, val )
    # VALUES ( 'max_indexed_id', $maxid )


    # ranged query throttling, in milliseconds
    # optional, default is 0 which means no delay
    # enforces given delay before each query step
    sql_ranged_throttle = 0

    # document info query, ONLY for CLI search (ie. testing and debugging)
    # optional, default is empty
    # must contain $id macro and must fetch the document by that id
    #sql_query_info = SELECT a.id,a.ext_company_name,b.content,b.realname,b.mob FROM tour_customer a,tour_group_user b WHERE a.adduid=b.id and a.id=$id
    sql_query_info = SELECT a.id,1 as source_id,a.ext_company_name,b.content,b.realname,b.mob FROM tour_customer a,tour_group_user b WHERE a.adduid=b.id and a.id=$id

    # kill-list query, fetches the document IDs for kill-list
    # k-list will suppress matches from preceding indexes in the same query
    # optional, default is empty
    #
    # sql_query_killlist = SELECT id FROM documents WHERE edited>=@last_reindex


    # columns to unpack on indexer side when indexing
    # multi-value, optional, default is empty list
    #
    # unpack_zlib = zlib_column
    # unpack_mysqlcompress = compressed_column
    # unpack_mysqlcompress = compressed_column_2


    # maximum unpacked length allowed in MySQL COMPRESS() unpacker
    # optional, default is 16M
    #
    # unpack_mysqlcompress_maxsize = 16M


    #####################################################################
    ## xmlpipe settings
    #####################################################################

    # type = xmlpipe

    # shell command to invoke xmlpipe stream producer
    # mandatory
    #
    # xmlpipe_command = cat /usr/local/coreseek/var/test.xml

    #####################################################################
    ## xmlpipe2 settings
    #####################################################################

    # type = xmlpipe2
    # xmlpipe_command = cat /usr/local/coreseek/var/test2.xml


    # xmlpipe2 field declaration
    # multi-value, optional, default is empty
    #
    # xmlpipe_field = subject
    # xmlpipe_field = content


    # xmlpipe2 attribute declaration
    # multi-value, optional, default is empty
    # all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX
    #
    # xmlpipe_attr_timestamp = published
    # xmlpipe_attr_uint = author_id


    # perform UTF-8 validation, and filter out incorrect codes
    # avoids XML parser choking on non-UTF-8 documents
    # optional, default is 0
    #
    # xmlpipe_fixup_utf8 = 1
    }


    # inherited source example
    #
    # all the parameters are copied from the parent source,
    # and may then be overridden in this source definition
    source src1throttled : src1
    {
    # the only override: src1 sets sql_ranged_throttle to 0, this source sets it to 100
    sql_ranged_throttle = 100
    }


    #############################################################################
    ## index definition
    #############################################################################

    # local index example
    #
    # this is an index which is stored locally in the filesystem
    #
    # all indexing-time options (such as morphology and charsets)
    # are configured per local index
    index test1
    {
    # document source(s) to index
    # multi-value, mandatory
    # document IDs must be globally unique across all sources
    source = src1

    # index files path and file name, without extension
    # mandatory, path must be writable, extensions will be auto-appended
    path = /usr/local/coreseek/var/data/test1

    # document attribute values (docinfo) storage mode
    # optional, default is 'extern'
    # known values are 'none', 'extern' and 'inline'
    docinfo = extern

    # memory locking for cached data (.spa and .spi), to prevent swapping
    # optional, default is 0 (do not mlock)
    # requires searchd to be run from root
    mlock = 0

    # a list of morphology preprocessors to apply
    # optional, default is empty
    #
    # builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
    # 'soundex', and 'metaphone'; additional preprocessors available from
    # libstemmer are 'libstemmer_XXX', where XXX is algorithm code
    # (see libstemmer_c/libstemmer/modules.txt)
    #
    # morphology = stem_en, stem_ru, soundex
    # morphology = libstemmer_german
    # morphology = libstemmer_sv
    morphology = none

    # minimum word length at which to enable stemming
    # optional, default is 1 (stem everything)
    #
    # min_stemming_len = 1


    # stopword files list (space separated)
    # optional, default is empty
    # contents are plain text, charset_table and stemming are both applied
    #
    #stopwords = /usr/local/coreseek/var/data/stopwords.txt


    # wordforms file, in "mapfrom > mapto" plain text format
    # optional, default is empty
    #
    #wordforms = G:\data\wordforms.txt


    # tokenizing exceptions file
    # optional, default is empty
    #
    # plain text, case sensitive, space insensitive in map-from part
    # one "Map Several Words => ToASingleOne" entry per line
    #
    #exceptions = /data/exceptions.txt


    # minimum indexed word length
    # default is 1 (index everything)
    min_word_len = 1

    # charset encoding type
    # optional, default is 'sbcs'
    # known types are 'sbcs' (Single Byte CharSet) and 'utf-8'
    charset_type = zh_cn.utf-8
    charset_dictpath=/usr/local/mmseg3/etc

    # charset definition and case folding rules "table"
    # optional, default value depends on charset_type
    #
    # defaults are configured to include English and Russian characters only
    # you need to change the table to include additional ones
    # this behavior MAY change in future versions
    #
    # 'sbcs' default value is
    # charset_table = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF
    #
    # 'utf-8' default value is
    # charset_table = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F


    # ignored characters list
    # optional, default value is empty
    #
    # ignore_chars = U+00AD


    # minimum word prefix length to index
    # optional, default is 0 (do not index prefixes)
    #
    # min_prefix_len = 0


    # minimum word infix length to index
    # optional, default is 0 (do not index infixes)
    #
    # min_infix_len = 0


    # list of fields to limit prefix/infix indexing to
    # optional, default value is empty (index all fields in prefix/infix mode)
    #
    # prefix_fields = filename
    # infix_fields = url, domain


    # enable star-syntax (wildcards) when searching prefix/infix indexes
    # known values are 0 and 1
    # optional, default is 0 (do not use wildcard syntax)
    #
    # enable_star = 1


    # n-gram length to index, for CJK indexing
    # only supports 0 and 1 for now, other lengths to be implemented
    # optional, default is 0 (disable n-grams)
    #
    # ngram_len = 1


    # n-gram characters list, for CJK indexing
    # optional, default is empty
    #
    # ngram_chars = U+3000..U+2FA1F


    # phrase boundary characters list
    # optional, default is empty
    #
    # phrase_boundary = ., ?, !, U+2026 # horizontal ellipsis


    # phrase boundary word position increment
    # optional, default is 0
    #
    # phrase_boundary_step = 100


    # whether to strip HTML tags from incoming documents
    # known values are 0 (do not strip) and 1 (do strip)
    # optional, default is 0
    html_strip = 0

    # what HTML attributes to index if stripping HTML
    # optional, default is empty (do not index anything)
    #
    # html_index_attrs = img=alt,title; a=title;


    # what HTML elements contents to strip
    # optional, default is empty (do not strip element contents)
    #
    # html_remove_elements = style, script


    # whether to preopen index data files on startup
    # optional, default is 0 (do not preopen), searchd-only
    #
    # preopen = 1


    # whether to keep dictionary (.spi) on disk, or cache it in RAM
    # optional, default is 0 (cache in RAM), searchd-only
    #
    # ondisk_dict = 1


    # whether to enable in-place inversion (2x less disk, 90-95% speed)
    # optional, default is 0 (use separate temporary files), indexer-only
    #
    # inplace_enable = 1


    # in-place fine-tuning options
    # optional, defaults are listed below
    #
    # inplace_hit_gap = 0 # preallocated hitlist gap size
    # inplace_docinfo_gap = 0 # preallocated docinfo gap size
    # inplace_reloc_factor = 0.1 # relocation buffer size within arena
    # inplace_write_factor = 0.1 # write buffer size within arena

    # whether to index original keywords along with stemmed versions
    # enables "=exactform" operator to work
    # optional, default is 0
    #
    # index_exact_words = 1


    # position increment on overshort (less than min_word_len) words
    # optional, allowed values are 0 and 1, default is 1
    #
    # overshort_step = 1


    # position increment on stopword
    # optional, allowed values are 0 and 1, default is 1
    #
    # stopword_step = 1
    }

    # Index inherited from test1; overrides only the source, the path and the morphology
    index src1throttled : test1
    {
    source = src1throttled
    # separate file path from test1's /usr/local/coreseek/var/data/test1
    path = /usr/local/coreseek/var/data/src1throttled
    # test1 sets morphology = none; this index uses the builtin 'stem_en' preprocessor instead
    morphology = stem_en
    }


    # distributed index example
    #
    # this is a virtual index which can NOT be directly indexed,
    # and only contains references to other local and/or remote indexes
    index dist1
    {
    # 'distributed' index type MUST be specified
    type = distributed

    # local index to be searched
    # there can be many local indexes configured
    local = test1
    local = src1throttled
    local = test2
    local = src2throttled


    # remote agent
    # multiple remote agents may be specified
    # syntax for TCP connections is 'hostname:port:index1,[index2[,...]]'
    # syntax for local UNIX connections is '/path/to/socket:index1,[index2[,...]]'
    agent = localhost:9313:remote1
    agent = localhost:9314:remote2,remote3
    # agent = /var/run/searchd.sock:remote4

    # blackhole remote agent, for debugging/testing
    # network errors and search results will be ignored
    #
    # agent_blackhole = testbox:9312:testindex1,testindex2


    # remote agent connection timeout, milliseconds
    # optional, default is 1000 ms, ie. 1 sec
    agent_connect_timeout = 1000

    # remote agent query timeout, milliseconds
    # optional, default is 3000 ms, ie. 3 sec
    agent_query_timeout = 3000
    }

    #############################################################################
    ## indexer settings
    #############################################################################

    indexer
    {
    # memory limit, in bytes, kilobytes (16384K) or megabytes (256M)
    # optional, default is 32M, max is 2047M, recommended is 256M to 1024M
    mem_limit = 32M

    # maximum IO calls per second (for I/O throttling)
    # optional, default is 0 (unlimited)
    #
    # max_iops = 40


    # maximum IO call size, bytes (for I/O throttling)
    # optional, default is 0 (unlimited)
    #
    # max_iosize = 1048576


    # maximum xmlpipe2 field length, bytes
    # optional, default is 2M
    #
    # max_xmlpipe2_field = 4M


    # write buffer size, bytes
    # several (currently up to 4) buffers will be allocated
    # write buffers are allocated in addition to mem_limit
    # optional, default is 1M
    #
    # write_buffer = 1M
    }

    #############################################################################
    ## searchd settings
    #############################################################################

    searchd
    {
    # hostname, port, or hostname:port, or /unix/socket/path to listen on
    # multi-value, multiple listen points are allowed
    # optional, default is 0.0.0.0:9312 (listen on all interfaces, port 9312)
    #
    # listen = 127.0.0.1
    # listen = 192.168.0.1:9312
    # listen = 9312
    # listen = /var/run/searchd.sock


    # log file, searchd run info is logged here
    # optional, default is 'searchd.log'
    log = /usr/local/coreseek/var/log/searchd.log

    # query log file, all search queries are logged here
    # optional, default is empty (do not log queries)
    query_log = /usr/local/coreseek/var/log/query.log

    # client read timeout, seconds
    # optional, default is 5
    read_timeout = 5

    # request timeout, seconds
    # optional, default is 5 minutes
    client_timeout = 300

    # maximum amount of children to fork (concurrent searches to run)
    # optional, default is 0 (unlimited)
    max_children = 30

    # PID file, searchd process ID file name
    # mandatory
    pid_file = /usr/local/coreseek/var/log/searchd.pid

    # max amount of matches the daemon ever keeps in RAM, per-index
    # WARNING, THERE'S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL
    # default is 1000 (just like Google)
    max_matches = 1000

    # seamless rotate, prevents rotate stalls if precaching huge datasets
    # optional, default is 1
    seamless_rotate = 1

    # whether to forcibly preopen all indexes on startup
    # optional, default is 0 (do not preopen)
    preopen_indexes = 0

    # whether to unlink .old index copies on successful rotation.
    # optional, default is 1 (do unlink)
    unlink_old = 1

    # attribute updates periodic flush timeout, seconds
    # updates will be automatically dumped to disk this frequently
    # optional, default is 0 (disable periodic flush)
    #
    # attr_flush_period = 900


    # instance-wide ondisk_dict defaults (per-index value take precedence)
    # optional, default is 0 (precache all dictionaries in RAM)
    #
    # ondisk_dict_default = 1


    # MVA updates pool size
    # shared between all instances of searchd, disables attr flushes!
    # optional, default size is 1M
    mva_updates_pool = 1M

    # max allowed network packet size
    # limits both query packets from clients, and responses from agents
    # optional, default size is 8M
    max_packet_size = 8M

    # crash log path
    # searchd will (try to) log crashed query to 'crash_log_path.PID' file
    # optional, default is empty (do not create crash logs)
    #
    # crash_log_path = /usr/local/coreseek/var/log/crash


    # max allowed per-query filter count
    # optional, default is 256
    max_filters = 256

    # max allowed per-filter values count
    # optional, default is 4096
    max_filter_values = 4096


    # socket listen queue length
    # optional, default is 5
    #
    # listen_backlog = 5


    # per-keyword read buffer size
    # optional, default is 256K
    #
    # read_buffer = 256K


    # unhinted read size (currently used when reading hits)
    # optional, default is 32K
    #
    # read_unhinted = 32K
    compat_sphinxql_magics = 0
    }

    # --eof--

  • 相关阅读:
    Eclipse下搭建Django环境
    在pycharm中启动Django服务器
    Window10安装Django,并创建第一个Django项目
    windows10安装tensorflow CPU版本
    service中显示一个dialog
    xp局域网共享访问没权限处理
    java线程调度
    Java优雅停机
    jit编译原理
    java final思考
  • 原文地址:https://www.cnblogs.com/hxl2009/p/3871597.html
Copyright © 2011-2022 走看看