mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
### What problem does this PR solve?
This PR adds support for the latest OpenSearch 2.19.1 as a storage engine
and search engine option for RAGFlow.
### Main Benefit
1. OpenSearch 2.19.1 is licensed under the [Apache v2.0 License], which is
much better than Elasticsearch's license
2. For search, OpenSearch 2.19.1 supports full-text search, vector search,
and hybrid search, which are similar to Elasticsearch
in schema
3. For storage, OpenSearch 2.19.1 stores text and vectors, which are quite
similar to Elasticsearch in schema
### Changes
- Support the OpenSearch Python connector. I made a lot of adaptations since
the schema and API/methods of ES and OpenSearch differ in many
ways (especially knn_search, which has a significant gap):
rag/utils/opensearch_coon.py
- Support static config adaptations by changing:
conf/service_conf.yaml, api/settings.py, rag/settings.py
- Support some store & search schema changes between OpenSearch and ES:
conf/os_mapping.json
- Support OpenSearch python sdk : pyproject.toml
- Support docker config for OpenSearch 2.19.1:
docker/.env, docker/docker-compose-base.yml, docker/service_conf.yaml.template
### How to use
- I didn't change the default: ES remains the default doc/search engine.
OpenSearch is used only if we set DOC_ENGINE=${DOC_ENGINE:-opensearch} in
docker/.env.
### Others
Our team tested a lot of docs in our environment using OpenSearch as
the vector database, and it works very well.
All the config for OpenSearch is necessary.
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
---------
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
213 lines
4.4 KiB
JSON
213 lines
4.4 KiB
JSON
{
  "settings": {
    "index": {
      "number_of_shards": 2,
      "number_of_replicas": 0,
      "refresh_interval": "1000ms",
      "knn": true,
      "similarity": {
        "scripted_sim": {
          "type": "scripted",
          "script": {
            "source": "double idf = Math.log(1+(field.docCount-term.docFreq+0.5)/(term.docFreq + 0.5))/Math.log(1+((field.docCount-0.5)/1.5)); return query.boost * idf * Math.min(doc.freq, 1);"
          }
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "lat_lon": {
        "type": "geo_point",
        "store": true
      }
    },
    "date_detection": true,
    "dynamic_templates": [
      {
        "int": {
          "match": "*_int",
          "mapping": {
            "type": "integer",
            "store": true
          }
        }
      },
      {
        "ulong": {
          "match": "*_ulong",
          "mapping": {
            "type": "unsigned_long",
            "store": true
          }
        }
      },
      {
        "long": {
          "match": "*_long",
          "mapping": {
            "type": "long",
            "store": true
          }
        }
      },
      {
        "short": {
          "match": "*_short",
          "mapping": {
            "type": "short",
            "store": true
          }
        }
      },
      {
        "numeric": {
          "match": "*_flt",
          "mapping": {
            "type": "float",
            "store": true
          }
        }
      },
      {
        "tks": {
          "match": "*_tks",
          "mapping": {
            "type": "text",
            "similarity": "scripted_sim",
            "analyzer": "whitespace",
            "store": true
          }
        }
      },
      {
        "ltks": {
          "match": "*_ltks",
          "mapping": {
            "type": "text",
            "analyzer": "whitespace",
            "store": true
          }
        }
      },
      {
        "kwd": {
          "match_pattern": "regex",
          "match": "^(.*_(kwd|id|ids|uid|uids)|uid)$",
          "mapping": {
            "type": "keyword",
            "similarity": "boolean",
            "store": true
          }
        }
      },
      {
        "dt": {
          "match_pattern": "regex",
          "match": "^.*(_dt|_time|_at)$",
          "mapping": {
            "type": "date",
            "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM-dd_HH:mm:ss",
            "store": true
          }
        }
      },
      {
        "nested": {
          "match": "*_nst",
          "mapping": {
            "type": "nested"
          }
        }
      },
      {
        "object": {
          "match": "*_obj",
          "mapping": {
            "type": "object",
            "dynamic": true
          }
        }
      },
      {
        "string": {
          "match_pattern": "regex",
          "match": "^.*_(with_weight|list)$",
          "mapping": {
            "type": "text",
            "index": false,
            "store": true
          }
        }
      },
      {
        "rank_feature": {
          "match": "*_fea",
          "mapping": {
            "type": "rank_feature"
          }
        }
      },
      {
        "rank_features": {
          "match": "*_feas",
          "mapping": {
            "type": "rank_features"
          }
        }
      },
      {
        "knn_vector": {
          "match": "*_512_vec",
          "mapping": {
            "type": "knn_vector",
            "index": true,
            "space_type": "cosinesimil",
            "dimension": 512
          }
        }
      },
      {
        "knn_vector": {
          "match": "*_768_vec",
          "mapping": {
            "type": "knn_vector",
            "index": true,
            "space_type": "cosinesimil",
            "dimension": 768
          }
        }
      },
      {
        "knn_vector": {
          "match": "*_1024_vec",
          "mapping": {
            "type": "knn_vector",
            "index": true,
            "space_type": "cosinesimil",
            "dimension": 1024
          }
        }
      },
      {
        "knn_vector": {
          "match": "*_1536_vec",
          "mapping": {
            "type": "knn_vector",
            "index": true,
            "space_type": "cosinesimil",
            "dimension": 1536
          }
        }
      },
      {
        "binary": {
          "match": "*_bin",
          "mapping": {
            "type": "binary"
          }
        }
      }
    ]
  }
}