ubuntu下的中文搜索sphinx的安装配置

默北 | Sphinx | ubuntu下的中文搜索sphinx的安装配置 | 已关闭评论 | 7,724 | 字数 5158 | 阅读17分11秒 | 阅读模式

ubuntu下的中文搜索sphinx的安装配置

一.安装依赖包

$ sudo apt-get install make gcc g++ automake libtool mysql-client libmysqlclient15-dev libxml2-dev libexpat1-dev

二.安装中文分词

$ sudo wget -c http://www.coreseek.cn/uploads/csft/3.1/Source/mmseg-3.1.tar.gz

$ sudo tar zxvf mmseg-3.1.tar.gz -C ../software/

$ sudo ./configure --prefix=/usr/local/mmseg

$ sudo make

$ sudo make install

$ sudo mkdir dict

$ sudo cp /usr/local/src/tarbag/words.txt.uni ./uni.lib

$ sudo vim mmseg.ini

[mmseg]

merge_number_and_asci=1; //字母和数字连续出现是否切分

number_and_asci_joint=-.; //连接数字和字母可用的符号

compress_space=0;

seperate_number_asci=1; //是否拆分数字

三.安装sphinx

$ sudo wget http://www.coreseek.cn/uploads/csft/3.1/Source/csft-3.1.tar.gz

$ sudo tar zxvf csft-3.1.tar.gz -C ../software/

$ sudo ./configure --prefix=/usr/local/csft --with-mysql=/usr/local/mysql --with-mysql-includes=/usr/local/mysql/include --with-mysql-libs=/usr/local/mysql/lib --with-mmseg=/usr/local/mmseg --with-mmseg-includes=/usr/local/mmseg/include/mmseg --with-mmseg-libs=/usr/local/mmseg/lib

$ sudo make

$ sudo make install

四.新建sph_counter表

CREATE TABLE `sph_counter` (

`counter_id` int(11) NOT NULL,

`max_doc_id` int(11) NOT NULL,

PRIMARY KEY (`counter_id`)

) ENGINE=InnoDB DEFAULT CHARSET=utf8;

五.配置

$ cd /usr/local/csft/etc/

$ sudo cp sphinx.conf.dist sphinx.conf

$ sudo vim sphinx.conf

source bbs

{

type = mysql

sql_host = localhost

sql_user = root

sql_pass =

sql_db = test

sql_sock = /tmp/mysqld.sock

sql_query_pre = SET NAMES utf8

sql_query_pre = SET SESSION query_cache_type=OFF

sql_query_pre = REPLACE INTO sph_counter SELECT 1,MAX(pid) FROM pre_forum_post

sql_query = \

SELECT pid, fid, tid, first, invisible, authorid, dateline, subject, message \

FROM pre_forum_post \

WHERE pid<=(SELECT max_doc_id FROM sph_counter WHERE counter_id=1)

sql_attr_uint = fid

sql_attr_uint = tid

sql_attr_uint = first

sql_attr_uint = invisible

sql_attr_uint = authorid

sql_attr_timestamp = dateline

sql_query_info = SELECT * FROM pre_forum_post WHERE pid=$id

}

source bbs_delta : bbs

{

sql_query_pre = SET NAMES utf8

sql_query_pre = SET SESSION query_cache_type=OFF

sql_query = \

SELECT pid, fid, tid, first, invisible, authorid, dateline, subject, message \

FROM pre_forum_post \

WHERE pid>(SELECT max_doc_id FROM sph_counter WHERE counter_id=1)

}

source bbs_merge : bbs

{

sql_query_pre = SET NAMES utf8

sql_query_pre = SET SESSION query_cache_type=OFF

sql_query = \

SELECT pid, fid, tid, first, invisible, authorid, dateline, subject, message \

FROM pre_forum_post \

WHERE pid>(SELECT max_doc_id FROM sph_counter WHERE counter_id=1)

sql_query_post = REPLACE INTO sph_counter SELECT 1, MAX(pid) FROM pre_forum_post

}

index bbs

{

source = bbs

path = /usr/local/csft/var/data/bbs

docinfo = extern

mlock = 0

morphology = none

min_word_len = 1

charset_type = zh_cn.utf-8

charset_dictpath = /usr/local/mmseg/dict

html_strip = 0

}

index bbs_delta : bbs

{

source = bbs_delta

path = /usr/local/csft/var/data/bbs_delta

}

index bbs_merge : bbs

{

source = bbs_merge

path = /usr/local/csft/var/data/bbs_merge

}

indexer

{

mem_limit = 256M

}

searchd

{

log = /usr/local/csft/var/log/searchd.log

query_log = /usr/local/csft/var/log/query.log

read_timeout = 5

client_timeout = 300

max_children = 30

pid_file = /usr/local/csft/var/log/searchd.pid

max_matches = 1000

seamless_rotate = 1

preopen_indexes = 0

unlink_old = 1

mva_updates_pool = 1M

max_packet_size = 8M

max_filters = 256

max_filter_values = 4096

}

六.生成索引

$ sudo /usr/local/csft/bin/indexer --config /usr/local/csft/etc/sphinx.conf --all

Coreseek Full Text Server 3.1

Copyright (c) 2006-2008 coreseek.com

using config file '/usr/local/csft/etc/sphinx.conf'...

indexing index 'bbs'...

iniparser: cannot open /usr/local/mmseg/dict/mmseg.ini

collected 3 docs, 0.0 MB

sorted 0.0 Mhits, 100.0% done

total 3 docs, 39578 bytes

total 0.050 sec, 799410.19 bytes/sec, 60.60 docs/sec

indexing index 'bbs_delta'...

collected 3 docs, 0.0 MB

sorted 0.0 Mhits, 100.0% done

total 3 docs, 39578 bytes

total 0.044 sec, 902329.94 bytes/sec, 68.40 docs/sec

indexing index 'bbs_merge'...

collected 3 docs, 0.0 MB

sorted 0.0 Mhits, 100.0% done

total 3 docs, 39578 bytes

total 0.022 sec, 1767980.00 bytes/sec, 134.01 docs/sec

total 9 reads, 0.0 sec, 21.3 kb/read avg, 0.0 msec/read avg

total 21 writes, 0.0 sec, 10.9 kb/write avg, 0.0 msec/write avg

七.测试

$ sudo /usr/local/csft/bin/search --config /usr/local/csft/etc/sphinx.conf "盛大"

Coreseek Full Text Server 3.1

Copyright (c) 2006-2008 coreseek.com

using config file '/usr/local/csft/etc/sphinx.conf'...

index 'bbs': query '盛大 ': returned 1 matches of 1 total in 0.004 sec

displaying matches:

1. document=29, weight=2, fid=33, tid=20, first=1, invisible=0, authorid=2, dateline=Thu Dec 23 06:14:00 2004

words:

1. '盛': 1 documents, 1 hits

2. '大': 2 documents, 54 hits

index 'bbs_delta': query '盛大 ': returned 1 matches of 1 total in 0.000 sec

displaying matches:

1. document=29, weight=2, fid=33, tid=20, first=1, invisible=0, authorid=2, dateline=Thu Dec 23 06:14:00 2004

words:

1. '盛': 1 documents, 1 hits

2. '大': 2 documents, 54 hits

index 'bbs_merge': query '盛大 ': returned 1 matches of 1 total in 0.000 sec

displaying matches:

1. document=29, weight=2, fid=33, tid=20, first=1, invisible=0, authorid=2, dateline=Thu Dec 23 06:14:00 2004

words:

1. '盛': 1 documents, 1 hits

2. '大': 2 documents, 54 hits

八.启动searchd

$ sudo /usr/local/csft/bin/searchd --config /usr/local/csft/etc/sphinx.conf

Coreseek Full Text Server 3.1

Copyright (c) 2006-2008 coreseek.com

using config file '/usr/local/csft/etc/sphinx.conf'...

listening on all interfaces, port=3312

九.计划任务更新合并索引

$ sudo crontab -e

# m h dom mon dow command

*/5 * * * * /usr/local/csft/bin/indexer --config /usr/local/csft/etc/sphinx.conf bbs_delta --rotate

00 04 * * * /usr/local/csft/bin/indexer --config /usr/local/csft/etc/sphinx.conf bbs_merge --rotate && /usr/local/csft/bin/indexer --config /usr/local/csft/etc/sphinx.conf --merge bbs bbs_merge --rotate

补充:sphinx 1.x 版本已支持实时索引(real-time index)。

weinxin
我的微信
微信公众号
扫一扫关注运维生存时间公众号,获取最新技术文章~
默北
  • 本文由 发表于 06/04/2012 19:49:43
  • 转载请务必保留本文链接:https://www.ttlsa.com/sphinx/install-sphinx-on-ubuntu-2/