aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorneodarz <neodarz@neodarz.net>2019-01-14 22:20:36 +0100
committerneodarz <neodarz@neodarz.net>2019-01-14 22:20:43 +0100
commit65ae9c9ec577076bf7f1355166763fbf691a9349 (patch)
treea9e5b199b1c501952bb13a37a75bf13e8fd294d7
parent07cc1fdc7e79468a6e8d9b53048712b652f8da84 (diff)
downloadkhanindexer-65ae9c9ec577076bf7f1355166763fbf691a9349.tar.xz
khanindexer-65ae9c9ec577076bf7f1355166763fbf691a9349.zip
Add mantacoresearch part
Diffstat (limited to '')
-rw-r--r--README.md53
-rw-r--r--sphinx_search.conf39
2 files changed, 87 insertions, 5 deletions
diff --git a/README.md b/README.md
index 1944f82..69d329d 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,23 @@ It is recommended to use [virtualenv](https://virtualenv.pypa.io).
pip install -r requirements.txt
```
+## Testing
+
+If you just want to test and don't want to install a PostgreSQL database
+but have Docker installed, just use the `docker-compose.yml`.
+
+This is only for testing; don't use it in production (the docker-compose
+file)!
+
+## Sphinx-search / Manticore-search
+
+You can use [Sphinx-search](http://sphinxsearch.com/) but it's recommended to use
+[Manticore-search](https://manticoresearch.com/) since the latest version of
+sphinx-search is distributed as closed-source instead of open-source (for
+version 3.x).
+
+All explanations below are for Manticore-search for the moment.
+
# Crawling
For now there is an example spider with neodarz website.
@@ -19,10 +36,36 @@ python app.py
The database is in the sqlite file `khanindexer.db` at the root of the project.
-# Testing
+# Indexing
-If you just want to test and don't want to install a PostgreSQL database
-but have Docker installed, juste use the `docker-compose.yml`.
+Before launching an indexing or searching command, you must verify that the folder
+of the `path` option exists on your system (Warning: the last word of the `path`
+option is the value of the `source` option; don't create this folder, only
+its parent folder).
-This is only for test, don't use this shit on production (the docker-compose
-file)!
+Example with the configuration for the indexer `datas`:
+
+```
+index datas {
+ source = datas
+ path = /tmp/data/datas
+}
+```
+Here the folder is `/tmp/data/`
+
+The command for indexing is:
+```
+indexer --config sphinx_search.conf --all
+```
+
+# Searching
+
+Before you can search, you must launch the search server:
+```
+searchd -c sphinx_search.conf
+```
+
+Example search command:
+```
+curl -X POST '127.0.0.1:8080/search' -d 'index=datas&match=@content livet&select=id&limit=5' --output -
+```
diff --git a/sphinx_search.conf b/sphinx_search.conf
new file mode 100644
index 0000000..0259dc0
--- /dev/null
+++ b/sphinx_search.conf
@@ -0,0 +1,39 @@
+source datas {
+ type = pgsql
+
+ sql_host = 127.0.0.1
+ sql_user = root
+ sql_pass = root
+ sql_db = khanindexer
+
+ sql_query = SELECT id, url, title, content FROM page
+
+}
+
+index datas {
+ source = datas
+ path = /tmp/data/datas
+}
+
+indexer {
+ mem_limit = 32M
+}
+
+searchd {
+ listen = localhost:8080:http
+ query_log = /tmp/query.log
+ binlog_path = /tmp/
+ read_timeout = 5
+ client_timeout = 300
+ max_children = 30
+ persistent_connections_limit = 30
+ pid_file = /tmp/searchd.pid
+ seamless_rotate = 1
+ preopen_indexes = 1
+ unlink_old = 1
+ mva_updates_pool = 1M
+ max_packet_size = 8M
+ max_filter_values = 4096
+ max_batch_queries = 32
+ workers = threads_pool
+}