From 65ae9c9ec577076bf7f1355166763fbf691a9349 Mon Sep 17 00:00:00 2001 From: neodarz Date: Mon, 14 Jan 2019 22:20:36 +0100 Subject: Add mantacoresearch part --- README.md | 53 ++++++++++++++++++++++++++++++++++++++++++++++++----- sphinx_search.conf | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 5 deletions(-) create mode 100644 sphinx_search.conf diff --git a/README.md b/README.md index 1944f82..69d329d 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,23 @@ It is recommended to use [virtualenv](https://virtualenv.pypa.io). pip install -r requirements.txt ``` +## Testing + +If you just want to test and don't want to install a PostgreSQL database +but have Docker installed, just use the `docker-compose.yml`. + +This is only for testing, don't use this in production (the docker-compose +file)! + +## Sphinx-search / Manticore-search + +You can use [Sphinx-search](http://sphinxsearch.com/) but it's recommended to use +[Manticore-search](https://manticoresearch.com/) since the latest version of +sphinx-search is distributed as closed-source instead of open-source (for +version 3.x). + +All explanations are for Manticore-search for the moment. + # Crawling For now there is an example spider with neodarz website. @@ -19,10 +36,36 @@ python app.py The database is in the sqlite file `khanindexer.db` at the root of the project. -# Testing +# Indexing -If you just want to test and don't want to install a PostgreSQL database -but have Docker installed, juste use the `docker-compose.yml`. +Before launching an indexing or searching command you must verify that the folder of +the `path` option is present on your system (Warning: the last word of the `path` +option is the value of the `source` option, don't create this folder but only +its parent folder). -This is only for test, don't use this shit on production (the docker-compose -file)! 
+Example with the configuration for the indexer `datas`: + +``` +index datas { + source = datas + path = /tmp/data/datas +} +``` +Here the folder is `/tmp/data/` + +The command for indexing is: +``` +indexer --config sphinx_search.conf --all +``` + +# Searching + +Before you can search, you must launch the search server: +``` +searchd -c sphinx_search.conf +``` + +Example search command: +``` +curl -X POST '127.0.0.1:8080/search' -d 'index=datas&match=@content livet&select=id&limit=5' --output - +``` diff --git a/sphinx_search.conf b/sphinx_search.conf new file mode 100644 index 0000000..0259dc0 --- /dev/null +++ b/sphinx_search.conf @@ -0,0 +1,39 @@ +source datas { + type = pgsql + + sql_host = 127.0.0.1 + sql_user = root + sql_pass = root + sql_db = khanindexer + + sql_query = SELECT id, url, title, content FROM page + +} + +index datas { + source = datas + path = /tmp/data/datas +} + +indexer { + mem_limit = 32M +} + +searchd { + listen = localhost:8080:http + query_log = /tmp/query.log + binlog_path = /tmp/ + read_timeout = 5 + client_timeout = 300 + max_children = 30 + persistent_connections_limit = 30 + pid_file = /tmp/searchd.pid + seamless_rotate = 1 + preopen_indexes = 1 + unlink_old = 1 + mva_updates_pool = 1M + max_packet_size = 8M + max_filter_values = 4096 + max_batch_queries = 32 + workers = threads_pool +} -- cgit v1.2.1