Add quickwit client to benchmark (#1329)

Ma-cat · web-flow · commit 06e24b7fdf64 · 2024-06-14T19:51:58.000+08:00
### What problem does this PR solve?

* Add a Quickwit client to the benchmark.
* Add the Tantivy benchmark dataset (wiki-articles).

### Type of change

- [x] Other (please describe): update benchmark
diff --git a/docs/references/benchmark.md b/docs/references/benchmark.md
@@ -62,6 +62,9 @@ docker run -d --name qdrant --network host -v $HOME/qdrant/storage:/qdrant/stora
 
 mkdir -p $HOME/infinity
 docker run -d --name infinity -v $HOME/infinity:/var/infinity --ulimit nofile=500000:500000 --network=host infiniflow/infinity:0.1.0
+
+mkdir -p $HOME/quickwit
+docker run -d --rm -v $HOME/quickwit/qwdata:/quickwit/qwdata -p 127.0.0.1:7280:7280 quickwit/quickwit run
 ```
 
 4. Run Benchmark:
diff --git a/python/benchmark/clients/quickwit_client.py b/python/benchmark/clients/quickwit_client.py
@@ -0,0 +1,219 @@
+from typing import Any
+import json
+from typing import List
+import os
+import h5py
+import uuid
+import logging
+import requests
+import sys
+
+from .base_client import BaseClient
+
+
+class WrapQuickwitClient:
+    def __init__(self, base_url):
+        self.base_url = base_url
+
+    def create_index(self, index_config):
+        response = requests.post(
+            f"{self.base_url}/api/v1/indexes",
+            headers={'Content-Type': 'application/yaml'},
+            data=index_config
+        )
+        if response.status_code == 201:
+            return response.json()
+        else:
+            response.raise_for_status()
+
+    def index_exists(self, index):
+        response = requests.get(f"{self.base_url}/api/v1/indexes/{index}")
+        if response.status_code == 200:
+            return True
+        elif response.status_code == 404:
+            return False
+        else:
+            response.raise_for_status()
+
+    def delete_index(self, index):
+        response = requests.delete(f"{self.base_url}/api/v1/indexes/{index}")
+        if response.status_code == 200:
+            return response.json()
+        elif response.status_code == 404:
+            return {"error": "Index not found"}
+        else:
+            response.raise_for_status()
+
+    def upload_batch(self, index, data):
+        bulk_url = f'{self.base_url}/api/v1/{index}/ingest?commit=force'
+        response = requests.post(
+            bulk_url,
+            data=data
+        )
+        if response.status_code == 200:
+            return response.json()
+        else:
+            response.raise_for_status()
+
+    def search(self, index, query):
+        search_url = f"{self.base_url}/api/v1/_elastic/{index}/_search/"
+
+        response = requests.get(
+            search_url,
+            headers={'Content-Type': 'application/json'},
+            data=json.dumps(query)
+        )
+        if response.status_code == 200:
+            return response.json()
+        else:
+            response.raise_for_status()
+
+
+class QuickwitClient(BaseClient):
+    def __init__(self, conf_path: str) -> None:
+        """
+        The mode configuration file is parsed to extract the needed parameters, which are then all stored for use by other functions.
+        """
+        BaseClient.__init__(self, conf_path)
+        with open(conf_path, "r") as f:
+            self.data = json.load(f)
+        self.client = WrapQuickwitClient(base_url=self.data["connection_url"])
+        self.table_name = self.data["name"]
+        self.path_prefix = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        logging.getLogger("quickwit_transport").setLevel(logging.WARNING)
+
+    def upload_batch(self, actions: List):
+        self.client.upload_batch(self.table_name, actions)
+
+    def upload(self):
+        """
+        Upload data and build indexes (parameters are parsed by __init__).
+        """
+        if self.client.index_exists(index=self.table_name):
+            self.client.delete_index(index=self.table_name)
+
+        index_config_path = os.path.join(self.path_prefix, self.data["index_config_path"])
+        with open(index_config_path, 'rb') as file:
+            yaml_data = file.read()
+            self.client.create_index(index_config=yaml_data)
+
+        batch_size = self.data["insert_batch_size"]
+        dataset_path = os.path.join(self.path_prefix, self.data["data_path"])
+        if not os.path.exists(dataset_path):
+            self.download_data(self.data["data_link"], dataset_path)
+        _, ext = os.path.splitext(dataset_path)
+        # quickwit import data every batch cannot exceed 10MB
+        MAX_DATA_SIZE = 10 * 1024 * 1024
+        if ext == ".json":
+            with open(dataset_path, "r") as f:
+                bulk_request = ""
+                for i, line in enumerate(f):
+                    record = json.dumps(json.loads(line))
+                    record_str = f"{record}\n"
+                    if sys.getsizeof(bulk_request) + sys.getsizeof(record_str) >= MAX_DATA_SIZE:
+                        self.upload_batch(bulk_request)
+                        bulk_request = record_str
+                    else:
+                        bulk_request += record_str
+
+                    if i % 1000000 == 0 and i != 0:
+                        logging.info(f"row {i}")
+                if len(bulk_request) != 0:
+                    self.upload_batch(bulk_request)
+        elif ext == ".hdf5" and self.data["mode"] == "vector":
+            with h5py.File(dataset_path, "r") as f:
+                actions = []
+                for i, line in enumerate(f["train"]):
+                    if i % batch_size == 0 and i != 0:
+                        self.upload_batch(actions)
+                        actions = []
+                    record = {self.data["vector_name"]: line}
+                    actions.append(
+                        {
+                            "_index": self.table_name,
+                            "_id": uuid.UUID(int=i).hex,
+                            "_source": record,
+                        }
+                    )
+                if actions:
+                    self.upload_batch(actions)
+        elif ext == ".csv":
+            custom_headers = []
+            headers = self.data["index"]["doc_mapping"]["field_mappings"]
+            for key in headers:
+                custom_headers.append(key["name"])
+            with open(
+                    dataset_path, "r", encoding="utf-8", errors="replace"
+            ) as data_file:
+                bulk_request = ""
+                cnt = 0
+                for i, line in enumerate(data_file):
+                    row = line.strip().split("\t")
+                    if len(row) != len(headers):
+                        logging.info(
+                            f"row = {i}, row_len = {len(row)}, not equal headers len, skip"
+                        )
+                        continue
+                    row_dict = {header["name"]: value for header, value in zip(headers, row)}
+                    document = json.dumps(row_dict)
+                    bulk_request += f"{document}\n"
+                    cnt += 1
+                    if cnt >= batch_size:
+                        self.upload_batch(bulk_request)
+                        bulk_request = ""
+                        cnt = 0
+
+                if cnt != 0:
+                    self.upload_batch(bulk_request)
+                    cnt = 0
+        else:
+            raise TypeError("Unsupported file type")
+
+    def get_fulltext_query_content(self, query: str, is_and: bool = False) -> Any:
+        ret = None
+        if is_and:
+            terms = query.split()
+            ret = {
+                "query": {
+                    "bool": {"must": [{"match": {"body": term}} for term in terms]}
+                }
+            }
+        else:
+            ret = {
+                "query": {
+                    "query_string": {
+                        "query": query,
+                        "fields": [
+                            "body"
+                        ]
+                    }
+                },
+                "sort": ["_score"]
+            }
+        return ret
+
+    def setup_clients(self, num_threads=1):
+        self.clients = list()
+        for i in range(num_threads):
+            client = WrapQuickwitClient(self.data["connection_url"])
+            self.clients.append(client)
+
+    def do_single_query(self, query_id, client_id) -> list[Any]:
+        query = self.queries[query_id]
+        client = self.clients[client_id]
+        if self.data["mode"] == "fulltext":
+            body = self.get_fulltext_query_content(query)
+            body["size"] = self.data["topK"]
+
+            result = client.search(
+                index=self.table_name,
+                query=body,
+            )
+            result = [
+                # todo add _id
+                (0, hit["sort"][0])
+                for hit in result["hits"]["hits"]
+            ]
+            return result
+        else:
+            raise TypeError("Unsupported data mode {}".format(self.data["mode"]))
diff --git a/python/benchmark/configs/elasticsearch_tantivy.json b/python/benchmark/configs/elasticsearch_tantivy.json
@@ -0,0 +1,36 @@
+{
+  "name": "elasticsearch_tantivy",
+  "app": "elasticsearch",
+  "app_path": "servers/elasticsearch/",
+  "connection_url": "http://localhost:9200",
+  "data_path": "datasets/tantivy/wiki-articles.json",
+  "insert_batch_size": 8192,
+  "query_path": "datasets/tantivy/operations.txt",
+  "result_path": "datasets/tantivy/elasticsearch_result.jsonl",
+  "mode": "fulltext",
+  "topK": 10,
+  "index": {
+    "settings": {
+      "index.number_of_shards": 5,
+      "index.number_of_replicas": 0,
+      "index.requests.cache.enable": false
+    },
+    "mappings": {
+      "_source": {
+        "enabled": true
+      },
+      "dynamic": "strict",
+      "properties": {
+        "url": {
+          "type": "text"
+        },
+        "title": {
+          "type": "text"
+        },
+        "body": {
+          "type": "text"
+        }
+      }
+    }
+  }
+}
diff --git a/python/benchmark/configs/infinity_tantivy.json b/python/benchmark/configs/infinity_tantivy.json
@@ -0,0 +1,30 @@
+{
+  "name": "infinity_tantivy",
+  "app": "infinity",
+  "host": "127.0.0.1:23817",
+  "data_path": "datasets/tantivy/wiki-articles.json",
+  "data_link": "http://192.168.200.183:8000/wiki-articles.json.bz2",
+  "insert_batch_size": 8192,
+  "query_path": "datasets/tantivy/operations.txt",
+  "result_path": "datasets/tantivy/infinity_result.jsonl",
+  "query_link": "to_be_set",
+  "mode": "fulltext",
+  "topK": 10,
+  "use_import": false,
+  "schema": {
+    "url": {"type": "varchar", "default":""},
+    "title": {"type": "varchar", "default":""},
+    "body": {"type": "varchar", "default":""}
+  },
+  "index": {
+    "url": {
+      "type": "text"
+    },
+    "title": {
+      "type": "text"
+    },
+    "body": {
+      "type": "text"
+    }
+  }
+}
diff --git a/python/benchmark/configs/quickwit_enwiki.json b/python/benchmark/configs/quickwit_enwiki.json
@@ -0,0 +1,25 @@
+{
+  "name": "quickwit_enwiki",
+  "app": "quickwit",
+  "app_path": "servers/quickwit/",
+  "connection_url": "http://localhost:7280",
+  "data_path": "datasets/enwiki/enwiki.csv",
+  "insert_batch_size": 8192,
+  "query_path": "datasets/enwiki/operations.txt",
+  "result_path": "datasets/enwiki/quickwit_result.jsonl",
+  "mode": "fulltext",
+  "topK": 10,
+  "index_config_path": "configs/quickwit_enwiki_index.yaml",
+  "index": {
+    "version": "0.8",
+    "index_id": "quickwit_enwiki",
+    "doc_mapping": {
+      "mode": "lenient",
+      "field_mappings": [
+        { "name": "doctitle", "type": "text" },
+        { "name": "docdate", "type": "text" },
+        { "name": "body", "type": "text" }
+      ]
+    }
+  }
+}
diff --git a/python/benchmark/configs/quickwit_enwiki_index.yaml b/python/benchmark/configs/quickwit_enwiki_index.yaml
@@ -0,0 +1,28 @@
+# Quickwit index configuration for the enwiki benchmark dataset.
+# quickwit_enwiki.json points at this file through "index_config_path"; the
+# benchmark client reads it as raw bytes and POSTs it to /api/v1/indexes.
+version: 0.8

+# Must stay equal to "name" in quickwit_enwiki.json: the client uses that name
+# as the index id in every request URL.
+index_id: quickwit_enwiki

+doc_mapping:
+  # Same three fields, in the same order, as "field_mappings" in
+  # quickwit_enwiki.json, which the client uses to label the tab-separated
+  # columns of the dataset.
+  field_mappings:
+    - name: doctitle
+      type: text
+      tokenizer: default
+      record: position
+      stored: true
+    - name: docdate
+      type: text
+      tokenizer: default
+      record: position
+      stored: true
+    - name: body
+      type: text
+      tokenizer: default
+      record: position
+      stored: true


+# The client's full-text queries also name the body field explicitly.
+search_settings:
+  default_search_fields: [body]

+# NOTE(review): the client ingests with "?commit=force", so this timeout
+# presumably matters little during the benchmark upload - confirm.
+indexing_settings:
+  commit_timeout_secs: 10
diff --git a/python/benchmark/configs/quickwit_tantivy.json b/python/benchmark/configs/quickwit_tantivy.json
@@ -0,0 +1,25 @@
+{
+  "name": "quickwit_tantivy",
+  "app": "quickwit",
+  "app_path": "servers/quickwit/",
+  "connection_url": "http://localhost:7280",
+  "data_path": "datasets/tantivy/wiki-articles.json",
+  "insert_batch_size": 8192,
+  "query_path": "datasets/tantivy/operations.txt",
+  "result_path": "datasets/tantivy/quickwit_result.jsonl",
+  "mode": "fulltext",
+  "topK": 10,
+  "index_config_path": "configs/quickwit_tantivy_index.yaml",
+  "index": {
+    "version": "0.8",
+    "index_id": "quickwit_tantivy",
+    "doc_mapping": {
+      "mode": "lenient",
+      "field_mappings": [
+        { "name": "url", "type": "text" },
+        { "name": "title", "type": "text" },
+        { "name": "body", "type": "text" }
+      ]
+    }
+  }
+}
diff --git a/python/benchmark/configs/quickwit_tantivy_index.yaml b/python/benchmark/configs/quickwit_tantivy_index.yaml
diff --git a/python/benchmark/run.py b/python/benchmark/run.py