Global Search
In [1]
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
In [2]
import os
import pandas as pd
import tiktoken
from graphrag.config.enums import ModelType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
read_indexer_communities,
read_indexer_entities,
read_indexer_reports,
)
from graphrag.query.structured_search.global_search.community_context import (
GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch
Global Search example¶
The global search method generates answers by searching over all AI-generated community reports in a map-reduce fashion. This is a resource-intensive method, but it often gives good responses to questions that require an understanding of the dataset as a whole (e.g. What are the most significant values of the herbs mentioned in this notebook?).
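To make the map-reduce idea concrete, here is a deliberately simplified sketch: the map step scores each community report against the query independently, and the reduce step merges the best-scoring partial answers. This is an illustration only; GraphRAG's actual map and reduce steps are LLM calls, configured later in this notebook.

```python
# Schematic illustration of map-reduce over community reports.
# GraphRAG's real map/reduce steps are LLM calls, not the keyword heuristics below.

def map_step(report: str, query: str) -> tuple[int, str]:
    # "Map": score one community report against the query and return a partial answer.
    score = sum(word.lower() in report.lower() for word in query.split())
    return score, report[:200]

def reduce_step(partials: list[tuple[int, str]]) -> str:
    # "Reduce": merge the highest-scoring partial answers into one response.
    top = sorted((p for p in partials if p[0] > 0), reverse=True)[:3]
    return "\n---\n".join(text for _, text in top)

sample_reports = ["Report about Operation Dulce ...", "Report about unrelated topics ..."]
print(reduce_step([map_step(r, "What is operation dulce?") for r in sample_reports]))
```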
LLM setup¶
In [3]
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
config = LanguageModelConfig(
api_key=api_key,
type=ModelType.OpenAIChat,
model=llm_model,
max_retries=20,
)
model = ModelManager().get_or_create_chat_model(
name="global_search",
model_type=ModelType.OpenAIChat,
config=config,
)
token_encoder = tiktoken.encoding_for_model(llm_model)
api_key = os.environ["GRAPHRAG_API_KEY"] llm_model = os.environ["GRAPHRAG_LLM_MODEL"] config = LanguageModelConfig( api_key=api_key, type=ModelType.OpenAIChat, model=llm_model, max_retries=20, ) model = ModelManager().get_or_create_chat_model( name="global_search", model_type=ModelType.OpenAIChat, config=config, ) token_encoder = tiktoken.encoding_for_model(llm_model)
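Note that `tiktoken.encoding_for_model` raises a `KeyError` for model names it does not recognize (for example, some custom deployment names). A hedged fallback, assuming the generic `cl100k_base` encoding is close enough for token budgeting:

```python
# Fall back to a generic encoding when tiktoken does not know the model name.
try:
    token_encoder = tiktoken.encoding_for_model(llm_model)
except KeyError:
    # Assumption: cl100k_base is an acceptable approximation for budgeting purposes.
    token_encoder = tiktoken.get_encoding("cl100k_base")
```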
Load community reports as context for global search¶
- Load all community reports in the `community_reports` table from GraphRAG, to be used as context data for global search.
- Load entities from the `entities` table from GraphRAG, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, community weights are not calculated and only the `rank` attribute in the `community_reports` table is used for context ranking).
- Load all communities in the `communities` table from GraphRAG, to be used to reconstruct the community graph hierarchy for dynamic community selection.
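Before loading, it can help to confirm which parquet files the indexing pipeline actually produced (a small sketch; the directory matches the `INPUT_DIR` defined in the next cell):

```python
# List the parquet outputs produced by the indexing pipeline.
from pathlib import Path

for path in sorted(Path("./inputs/operation dulce").glob("*.parquet")):
    print(path.name)
```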
In [4]
# parquet files generated from indexing pipeline
INPUT_DIR = "./inputs/operation dulce"
COMMUNITY_TABLE = "communities"
COMMUNITY_REPORT_TABLE = "community_reports"
ENTITY_TABLE = "entities"
# community level in the Leiden community hierarchy from which we will load the community reports
# higher value means we use reports from more fine-grained communities (at the cost of higher computation cost)
COMMUNITY_LEVEL = 2
In [5]
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
communities = read_indexer_communities(community_df, report_df)
reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)
print(f"Total report count: {len(report_df)}")
print(
f"Report count after filtering by community level {COMMUNITY_LEVEL}: {len(reports)}"
)
report_df.head()
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet") entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet") report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet") communities = read_indexer_communities(community_df, report_df) reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL) entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL) print(f"报告总数: {len(report_df)}") print( f"按社区级别 {COMMUNITY_LEVEL} 过滤后的报告数: {len(reports)}" ) report_df.head()
Total report count: 2
Report count after filtering by community level 2: 2
Out[5]
| | id | human_readable_id | community | level | parent | children | title | summary | full_content | rank | rating_explanation | findings | full_content_json | period | size |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6c3a555680d647ac8be866a129c7b0ea | 0 | 0 | 0 | -1 | [] | Operation: Dulce and Dulce Base Exploration | The community revolves around 'Operation: Dulc... | # Operation: Dulce and Dulce Base Exploration\... | 8.5 | The impact severity rating is high due to the ... | [{'explanation': 'Operation: Dulce is a signif... | {\n "title": "Operation: Dulce and Dulce Ba... | 2025-03-04 | 7 |
| 1 | 0127331a1ea34b8ba19de2c2a4cb3bc9 | 1 | 1 | 0 | -1 | [] | Paranormal Military Squad and Operation: Dulce | The community centers around the Paranormal Mi... | # Paranormal Military Squad and Operation: Dul... | 8.5 | The impact severity rating is high due to the ... | [{'explanation': 'Agent Alex Mercer is a key f... | {\n "title": "Paranormal Military Squad and... | 2025-03-04 | 9 |
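If you are unsure which `COMMUNITY_LEVEL` to use for your own index, you can inspect how many communities exist at each level of the hierarchy (a small sketch; it assumes `community_df` carries a `level` column, as the reports table above does):

```python
# Count communities per level of the Leiden hierarchy.
# Assumption: community_df has a "level" column, as in the GraphRAG output schema.
print(community_df["level"].value_counts().sort_index())
```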
Build global context based on community reports¶
In [6]
context_builder = GlobalCommunityContext(
community_reports=reports,
communities=communities,
entities=entities, # default to None if you don't want to use community weights for ranking
token_encoder=token_encoder,
)
Perform global search¶
In [7]
context_builder_params = {
"use_community_summary": False, # False means using full community reports. True means using community short summaries.
"shuffle_data": True,
"include_community_rank": True,
"min_community_rank": 0,
"community_rank_name": "rank",
"include_community_weight": True,
"community_weight_name": "occurrence weight",
"normalize_community_weight": True,
"max_tokens": 12_000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
"context_name": "Reports",
}
map_llm_params = {
"max_tokens": 1000,
"temperature": 0.0,
"response_format": {"type": "json_object"},
}
reduce_llm_params = {
"max_tokens": 2000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
"temperature": 0.0,
}
context_builder_params = { "use_community_summary": False, # False 表示使用完整的社区报告。True 表示使用社区简短摘要。 "shuffle_data": True, "include_community_rank": True, "min_community_rank": 0, "community_rank_name": "rank", "include_community_weight": True, "community_weight_name": "occurrence weight", "normalize_community_weight": True, "max_tokens": 12_000, # 根据您的模型上的 token 限制更改此设置(如果您使用的是具有 8k 限制的模型,一个好的设置可能是 5000) "context_name": "Reports", } map_llm_params = { "max_tokens": 1000, "temperature": 0.0, "response_format": {"type": "json_object"}, } reduce_llm_params = { "max_tokens": 2000, # 根据您的模型上的 token 限制更改此设置(如果您使用的是具有 8k 限制的模型,一个好的设置可能是 1000-1500) "temperature": 0.0, }
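The `max_tokens` context budget above also has to leave room for the query and the prompt template. To sanity-check how much of it the full reports would occupy, you can count tokens with the same `token_encoder` (a rough sketch; it assumes the loaded report objects expose a `full_content` attribute, mirroring the parquet column shown earlier):

```python
# Rough token accounting for the full-report context against the 12k budget.
# Assumption: each item in `reports` has a `full_content` string attribute.
report_tokens = sum(len(token_encoder.encode(r.full_content)) for r in reports)
print(f"Tokens across {len(reports)} reports: {report_tokens} (max_tokens budget: 12000)")
```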
In [8]
search_engine = GlobalSearch(
model=model,
context_builder=context_builder,
token_encoder=token_encoder,
max_data_tokens=12_000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
map_llm_params=map_llm_params,
reduce_llm_params=reduce_llm_params,
allow_general_knowledge=False, # set this to True will add instruction to encourage the LLM to incorporate general knowledge in the response, which may increase hallucinations, but could be useful in some use cases.
json_mode=True, # set this to False if your LLM model does not support JSON mode.
context_builder_params=context_builder_params,
concurrent_coroutines=32,
response_type="multiple paragraphs", # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)
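The call in the next cell uses top-level `await`, which notebooks support. If you run this from a plain Python script instead, wrap it in an event loop (a minimal sketch):

```python
# Running the same search from a regular Python script instead of a notebook.
import asyncio

async def main() -> None:
    result = await search_engine.search("What is operation dulce?")
    print(result.response)

asyncio.run(main())
```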
In [9]
result = await search_engine.search("What is operation dulce?")
print(result.response)
result = await search_engine.search("什么是 operation dulce?") print(result.response)
Exception in _map_response_single_batch
Traceback (most recent call last):
  File "/home/runner/work/graphrag/graphrag/graphrag/query/structured_search/global_search/search.py", line 227, in _map_response_single_batch
    model_response = await self.model.achat(
  ...
  File "/home/runner/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py", line 1562, in _request
    raise self._make_status_error_from_response(err.response) from None
openai.AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-***...zWYA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Warning: All map responses have score 0 (i.e., no relevant information found from the dataset), returning a canned 'I do not know' answer. You can try enabling `allow_general_knowledge` to encourage the LLM to incorporate relevant general knowledge, at the risk of increasing hallucinations.
I am sorry but I am unable to answer this question given the provided data.
In [10]
# inspect the data used to build the context for the LLM responses
result.context_data["reports"]
Out[10]
| | id | title | occurrence weight | content | rank |
|---|---|---|---|---|---|
| 0 | 1 | Paranormal Military Squad and Operation: Dulce | 1.0 | # Paranormal Military Squad and Operation: Dul... | 8.5 |
| 1 | 0 | Operation: Dulce and Dulce Base Exploration | 1.0 | # Operation: Dulce and Dulce Base Exploration\... | 8.5 |
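Since `result.context_data["reports"]` is a pandas DataFrame, you can also persist it to audit exactly which reports were given to the LLM (a minimal sketch; the output filename is arbitrary):

```python
# Save the report context that was sent to the LLM for later inspection.
result.context_data["reports"].to_csv("global_search_context_reports.csv", index=False)
```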
In [11]
# inspect number of LLM calls and tokens
print(
f"LLM calls: {result.llm_calls}. Prompt tokens: {result.prompt_tokens}. Output tokens: {result.output_tokens}."
)
LLM calls: 1. Prompt tokens: 2401. Output tokens: 0.
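The token counts can be turned into a rough cost estimate. The per-1k-token prices below are placeholders, not real prices for any specific model; substitute your provider's actual pricing:

```python
# Back-of-the-envelope cost estimate; the prices below are placeholder assumptions.
PROMPT_PRICE_PER_1K = 0.0025  # hypothetical $ per 1k prompt tokens
OUTPUT_PRICE_PER_1K = 0.0100  # hypothetical $ per 1k output tokens

cost = (
    result.prompt_tokens / 1000 * PROMPT_PRICE_PER_1K
    + result.output_tokens / 1000 * OUTPUT_PRICE_PER_1K
)
print(f"Approximate cost for this query: ${cost:.4f}")
```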