DRIFT 搜索
In [1]
已复制!
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
# 版权所有 (c) 2024 Microsoft Corporation。 # 根据 MIT 许可证获得许可。
In [2]
已复制!
import os
from pathlib import Path
import pandas as pd
from graphrag.config.enums import ModelType
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
read_indexer_entities,
read_indexer_relationships,
read_indexer_report_embeddings,
read_indexer_reports,
read_indexer_text_units,
)
from graphrag.query.structured_search.drift_search.drift_context import (
DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.tokenizer.get_tokenizer import get_tokenizer
from graphrag.vector_stores.lancedb import LanceDBVectorStore
# Where the indexer output lives, and the base names of its parquet tables.
INPUT_DIR = "./inputs/operation dulce"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "community_reports"
COMMUNITY_TABLE = "communities"
ENTITY_TABLE = "entities"
RELATIONSHIP_TABLE = "relationships"
COVARIATE_TABLE = "covariates"
TEXT_UNIT_TABLE = "text_units"
COMMUNITY_LEVEL = 2


def _load_table(table_name):
    """Read ``{INPUT_DIR}/{table_name}.parquet`` into a DataFrame."""
    return pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")


def _open_vector_store(index_name):
    """Create a LanceDB vector store for *index_name* and connect it to LANCEDB_URI."""
    store = LanceDBVectorStore(
        vector_store_schema_config=VectorStoreSchemaConfig(index_name=index_name),
    )
    store.connect(db_uri=LANCEDB_URI)
    return store


# Entities are built from the entity table together with the community table
# and the chosen community level.
entity_df = _load_table(ENTITY_TABLE)
community_df = _load_table(COMMUNITY_TABLE)
print(f"Entity df columns: {entity_df.columns}")
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# Two stores over the same LanceDB database, distinguished by index name.
description_embedding_store = _open_vector_store("default-entity-description")
full_content_embedding_store = _open_vector_store("default-community-full_content")

print(f"Entity count: {len(entity_df)}")
# Bare expression kept from the original cell; only the cell's final
# expression is rendered by the notebook.
entity_df.head()

relationship_df = _load_table(RELATIONSHIP_TABLE)
relationships = read_indexer_relationships(relationship_df)
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

text_unit_df = _load_table(TEXT_UNIT_TABLE)
text_units = read_indexer_text_units(text_unit_df)
print(f"Text unit records: {len(text_unit_df)}")
# Last expression of the cell: this preview is what the notebook displays.
text_unit_df.head()
import os from pathlib import Path import pandas as pd from graphrag.config.enums import ModelType from graphrag.config.models.drift_search_config import DRIFTSearchConfig from graphrag.config.models.language_model_config import LanguageModelConfig from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig from graphrag.language_model.manager import ModelManager from graphrag.query.indexer_adapters import ( read_indexer_entities, read_indexer_relationships, read_indexer_report_embeddings, read_indexer_reports, read_indexer_text_units, ) from graphrag.query.structured_search.drift_search.drift_context import ( DRIFTSearchContextBuilder, ) from graphrag.query.structured_search.drift_search.search import DRIFTSearch from graphrag.tokenizer.get_tokenizer import get_tokenizer from graphrag.vector_stores.lancedb import LanceDBVectorStore INPUT_DIR = "./inputs/operation dulce" LANCEDB_URI = f"{INPUT_DIR}/lancedb" COMMUNITY_REPORT_TABLE = "community_reports" COMMUNITY_TABLE = "communities" ENTITY_TABLE = "entities" RELATIONSHIP_TABLE = "relationships" COVARIATE_TABLE = "covariates" TEXT_UNIT_TABLE = "text_units" COMMUNITY_LEVEL = 2 # 读取节点表以获取社区和度数据 entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet") community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet") print(f"实体数据框列:{entity_df.columns}") entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL) # 将描述嵌入加载到内存中的 lancedb 向量存储中 # 要连接到远程数据库,请指定 url 和端口值。 description_embedding_store = LanceDBVectorStore( vector_store_schema_config=VectorStoreSchemaConfig( index_name="default-entity-description" ), ) description_embedding_store.connect(db_uri=LANCEDB_URI) full_content_embedding_store = LanceDBVectorStore( vector_store_schema_config=VectorStoreSchemaConfig( index_name="default-community-full_content" ) ) full_content_embedding_store.connect(db_uri=LANCEDB_URI) print(f"实体数量:{len(entity_df)}") entity_df.head() relationship_df = 
pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet") relationships = read_indexer_relationships(relationship_df) print(f"关系数量:{len(relationship_df)}") relationship_df.head() text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet") text_units = read_indexer_text_units(text_unit_df) print(f"文本单元记录:{len(text_unit_df)}") text_unit_df.head()
Entity df columns: Index(['id', 'human_readable_id', 'title', 'type', 'description',
'text_unit_ids', 'frequency', 'degree', 'x', 'y'],
dtype='object')
Entity count: 18
Relationship count: 54
Text unit records: 5
Out[2]
| | id | human_readable_id | text | n_tokens | 文档 ID | 实体 ID | 关系 ID | 协变量 ID |
|---|---|---|---|---|---|---|---|---|
| 0 | 8e938693af886bfd081acbbe8384c3671446bff84a134a... | 1 | # 行动:杜尔塞\n\n## 第一章\n\n通过... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [745d28dd-be20-411b-85ff-1c69ca70e7b3, 9cba185... |
| 1 | fd1f46d32e1df6cd429542aeda3d64ddf3745ccb80f443... | 2 | , 海湾空洞的回声是一个鲜明的提醒... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [4f9b461f-5e8f-465d-9586-e2fc81787062, 0f74618... |
| 2 | 7296d9a1f046854d59079dc183de8a054c27c4843d2979... | 3 | 与他人的赞扬不同。这是... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [3ef1be9c-4080-4fac-99bd-c4a636248904, 8730b20... |
| 3 | ac72722a02ac71242a2a91fca323198d04197daf60515d... | 4 | 与笼罩着它的僵硬的沉默形成对比... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [2c292047-b79a-4958-ab57-7bf7d7a22c92, 3cbd18a... |
| 4 | 4c277337d461a16aaf8f9760ddb8b44ef220e948a2341d... | 5 | 一副职责的面具。\n\n在下降的过程中... | 35 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [d084d615-3584-4ec8-9931-90aa6075c764, 4b84859... | [6efdc42e-69a2-47c0-97ec-4b296cd16d5e] | [db8da02f-f889-4bb5-8e81-ab2a72e380bb] |
In [3]
已复制!
# Indexing os.environ directly means a missing variable fails fast with KeyError.
api_key = os.environ["GRAPHRAG_API_KEY"]

# Settings that are identical for the chat and the embedding configuration.
_shared_model_settings = {
    "api_key": api_key,
    "model_provider": "openai",
    "max_retries": 20,
}

# Chat model: configuration, model instance, and the tokenizer derived from
# the same configuration.
chat_config = LanguageModelConfig(
    type=ModelType.Chat,
    model="gpt-4.1",
    **_shared_model_settings,
)
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.Chat,
    config=chat_config,
)
tokenizer = get_tokenizer(chat_config)

# Embedding model, registered under its own name.
embedding_config = LanguageModelConfig(
    type=ModelType.Embedding,
    model="text-embedding-3-small",
    **_shared_model_settings,
)
text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.Embedding,
    config=embedding_config,
)
api_key = os.environ["GRAPHRAG_API_KEY"] chat_config = LanguageModelConfig( api_key=api_key, type=ModelType.Chat, model_provider="openai", model="gpt-4.1", max_retries=20, ) chat_model = ModelManager().get_or_create_chat_model( name="local_search", model_type=ModelType.Chat, config=chat_config, ) tokenizer = get_tokenizer(chat_config) embedding_config = LanguageModelConfig( api_key=api_key, type=ModelType.Embedding, model_provider="openai", model="text-embedding-3-small", max_retries=20, ) text_embedder = ModelManager().get_or_create_embedding_model( name="local_search_embedding", model_type=ModelType.Embedding, config=embedding_config, )
In [4]
已复制!
def read_community_reports(
    input_dir: str,
    community_report_table: str = COMMUNITY_REPORT_TABLE,
) -> pd.DataFrame:
    """Load the community-report table from a directory of parquet files.

    Reads ``<input_dir>/<community_report_table>.parquet`` and returns its
    contents as-is. This function only reads: it computes no embeddings and
    writes nothing to disk.

    Args:
        input_dir: Directory that contains the parquet table.
        community_report_table: Base name of the table file, without the
            ``.parquet`` suffix.

    Returns:
        The table contents as a DataFrame.

    Raises:
        Any exception raised by ``pandas.read_parquet`` (for example when the
        file is missing) is propagated unchanged.
    """
    input_path = Path(input_dir) / f"{community_report_table}.parquet"
    return pd.read_parquet(input_path)
# Load the community-report table from INPUT_DIR using the default table name.
report_df = read_community_reports(INPUT_DIR)
# Build report objects from the raw report table, the community table and the
# chosen community level.
# NOTE(review): content_embedding_col names the column expected to carry the
# full-content embeddings -- confirm it exists in the parquet file.
reports = read_indexer_reports(
    report_df,
    community_df,
    COMMUNITY_LEVEL,
    content_embedding_col="full_content_embeddings",
)
# The return value is discarded, so this call presumably works by updating
# `reports` in place from the full-content vector store -- TODO confirm.
read_indexer_report_embeddings(reports, full_content_embedding_store)
def read_community_reports( input_dir: str, community_report_table: str = COMMUNITY_REPORT_TABLE, ): """嵌入社区报告的完整内容并将带有嵌入的数据框保存到输出路径。""" input_path = Path(input_dir) / f"{community_report_table}.parquet" return pd.read_parquet(input_path) report_df = read_community_reports(INPUT_DIR) reports = read_indexer_reports( report_df, community_df, COMMUNITY_LEVEL, content_embedding_col="full_content_embeddings", ) read_indexer_report_embeddings(reports, full_content_embedding_store)
In [5]
已复制!
# DRIFT search settings; any option not listed here is left at whatever
# DRIFTSearchConfig defaults to.
drift_params = DRIFTSearchConfig(
    n=1,
    n_depth=3,
    drift_k_followups=3,
    primer_folds=1,
    max_tokens=12_000,
    temperature=0,
)

# The same chat model and tokenizer are handed to both the context builder
# and the search engine.
_model_and_tokenizer = {"model": chat_model, "tokenizer": tokenizer}

# Indexed data the context builder is given to work with.
_indexed_inputs = {
    "entities": entities,
    "relationships": relationships,
    "reports": reports,
    "text_units": text_units,
    "entity_text_embeddings": description_embedding_store,
}

context_builder = DRIFTSearchContextBuilder(
    config=drift_params,
    text_embedder=text_embedder,
    **_indexed_inputs,
    **_model_and_tokenizer,
)

search = DRIFTSearch(context_builder=context_builder, **_model_and_tokenizer)
drift_params = DRIFTSearchConfig( temperature=0, max_tokens=12_000, primer_folds=1, drift_k_followups=3, n_depth=3, n=1, ) context_builder = DRIFTSearchContextBuilder( model=chat_model, text_embedder=text_embedder, entities=entities, relationships=relationships, reports=reports, entity_text_embeddings=description_embedding_store, text_units=text_units, tokenizer=tokenizer, config=drift_params, ) search = DRIFTSearch( model=chat_model, context_builder=context_builder, tokenizer=tokenizer )
In [6]
已复制!
# Top-level `await` works here only because notebook kernels (IPython) run
# cells inside an event loop. In a plain script, wrap this call in an
# `async def` and drive it with asyncio.run().
resp = await search.search("Who is agent Mercer?")
resp = await search.search("默瑟探员是谁?")
0%| | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:08<00:00, 8.25s/it]
0%| | 0/3 [00:00<?, ?it/s]
Reached token limit - reverting to previous context state
Reached token limit - reverting to previous context state
Reached token limit - reverting to previous context state
33%|███▎ | 1/3 [00:12<00:24, 12.47s/it]
67%|██████▋ | 2/3 [00:15<00:07, 7.02s/it]
100%|██████████| 3/3 [00:16<00:00, 3.99s/it]
0%| | 0/3 [00:00<?, ?it/s]
Reached token limit - reverting to previous context state
Reached token limit - reverting to previous context state
Reached token limit - reverting to previous context state
33%|███▎ | 1/3 [00:13<00:26, 13.15s/it]
67%|██████▋ | 2/3 [00:14<00:06, 6.08s/it]
100%|██████████| 3/3 [00:15<00:00, 4.05s/it]
0%| | 0/3 [00:00<?, ?it/s]
Reached token limit - reverting to previous context state
Reached token limit - reverting to previous context state
Reached token limit - reverting to previous context state
33%|███▎ | 1/3 [00:12<00:24, 12.29s/it]
67%|██████▋ | 2/3 [00:14<00:06, 6.65s/it]
100%|██████████| 3/3 [00:17<00:00, 4.54s/it]
In [7]
已复制!
# Bare expression as the cell's last line: the notebook displays the answer text.
resp.response
resp.response
Out[7]
"Agent Alex Mercer: Role and Significance in Operation: Dulce\n------------------------------------------------------------\n\nAgent Alex Mercer is a central figure in the Paranormal Military Squad, serving as a key leader and mentor during Operation: Dulce—a mission focused on exploring the mysterious Dulce base and investigating advanced alien technology. Mercer is recognized for his leadership, adaptability, and emphasis on intuition and trust, which he instills in his team members, particularly Sam Rivera, the squad's cybersecurity expert. His mentorship strengthens team cohesion and prepares members for the unpredictable challenges of the mission.\n\nMercer collaborates closely with other prominent agents, such as Taylor Cruz (the authoritative de facto leader) and Dr. Jordan Hayes (a scientist specializing in alien technology). His professional relationship with Cruz balances adaptability with adherence to protocol, reflecting the diverse skills needed for mission success. Mercer is directly involved in the exploration of the Dulce base, engaging in investigative work, supporting scientific analysis, and maintaining team morale.\n\nOverall, Agent Mercer is depicted as a pivotal leader, mentor, and operative whose actions and relationships significantly shape the outcome of Operation: Dulce [Data: Reports (1); Sources (0, 1, 2, 3)]."
In [8]
已复制!
# As printed, context_data is a dict keyed by query text; each value holds
# 'reports', 'entities' and 'sources' tables.
print(resp.context_data)
print(resp.context_data)
{'What specific actions did Agent Mercer take during Operation: Dulce?': {'reports': id title \
0 1 Paranormal Military Squad and Operation: Dulce
content
0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame
Columns: [in_context]
Index: [], 'sources': id text
0 4 a mask of duty.\n\nIn the midst of the descen...
1 2 differently than praise from others. This was...
2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru...
3 1 , the hollow echo of the bay a stark reminder ...
4 3 contrast to the rigid silence enveloping the ...}, "How did Agent Mercer's mentorship influence Sam Rivera's performance?": {'reports': id title \
0 1 Paranormal Military Squad and Operation: Dulce
content
0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame
Columns: [in_context]
Index: [], 'sources': id text
0 4 a mask of duty.\n\nIn the midst of the descen...
1 2 differently than praise from others. This was...
2 1 , the hollow echo of the bay a stark reminder ...
3 3 contrast to the rigid silence enveloping the ...
4 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru...}, 'How did Agent Mercer interact with alien technology during the mission?': {'reports': id title \
0 1 Paranormal Military Squad and Operation: Dulce
content
0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame
Columns: [in_context]
Index: [], 'sources': id text
0 2 differently than praise from others. This was...
1 3 contrast to the rigid silence enveloping the ...
2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru...
3 1 , the hollow echo of the bay a stark reminder ...
4 4 a mask of duty.\n\nIn the midst of the descen...}, 'In what ways did Agent Mercer interact with alien technology during Operation: Dulce?': {'reports': id title \
0 1 Paranormal Military Squad and Operation: Dulce
content
0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame
Columns: [in_context]
Index: [], 'sources': id text
0 2 differently than praise from others. This was...
1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru...
2 1 , the hollow echo of the bay a stark reminder ...
3 3 contrast to the rigid silence enveloping the ...
4 4 a mask of duty.\n\nIn the midst of the descen...}, "How did Agent Mercer's relationship with other team members evolve throughout the operation?": {'reports': id title \
0 1 Paranormal Military Squad and Operation: Dulce
content
0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame
Columns: [in_context]
Index: [], 'sources': id text
0 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru...
1 2 differently than praise from others. This was...
2 1 , the hollow echo of the bay a stark reminder ...
3 3 contrast to the rigid silence enveloping the ...
4 4 a mask of duty.\n\nIn the midst of the descen...}, "What specific skills did Sam Rivera demonstrate as a result of Mercer's mentorship?": {'reports': id title \
0 1 Paranormal Military Squad and Operation: Dulce
content
0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame
Columns: [in_context]
Index: [], 'sources': id text
0 4 a mask of duty.\n\nIn the midst of the descen...
1 2 differently than praise from others. This was...
2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru...
3 1 , the hollow echo of the bay a stark reminder ...
4 3 contrast to the rigid silence enveloping the ...}, "Were there any moments where Rivera's performance directly impacted the outcome of Operation: Dulce?": {'reports': id title \
0 1 Paranormal Military Squad and Operation: Dulce
content
0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame
Columns: [in_context]
Index: [], 'sources': id text
0 2 differently than praise from others. This was...
1 4 a mask of duty.\n\nIn the midst of the descen...
2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru...
3 3 contrast to the rigid silence enveloping the ...
4 1 , the hollow echo of the bay a stark reminder ...}, "Did Mercer's mentorship style differ from that of Taylor Cruz or other leaders on the team?": {'reports': id title \
0 1 Paranormal Military Squad and Operation: Dulce
content
0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame
Columns: [in_context]
Index: [], 'sources': id text
0 4 a mask of duty.\n\nIn the midst of the descen...
1 2 differently than praise from others. This was...
2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru...
3 1 , the hollow echo of the bay a stark reminder ...
4 3 contrast to the rigid silence enveloping the ...}, "Are there specific examples where Mercer's approach led to better outcomes than Cruz's?": {'reports': id title \
0 1 Paranormal Military Squad and Operation: Dulce
content
0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame
Columns: [in_context]
Index: [], 'sources': id text
0 2 differently than praise from others. This was...
1 3 contrast to the rigid silence enveloping the ...
2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru...
3 1 , the hollow echo of the bay a stark reminder ...
4 4 a mask of duty.\n\nIn the midst of the descen...}}