本地搜索
在 [1] 中
已复制!
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
# 版权所有 (c) 2024 Microsoft Corporation。
# 根据 MIT 许可证获得许可。
在 [2] 中
已复制!
import os
import pandas as pd
import tiktoken
from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
read_indexer_covariates,
read_indexer_entities,
read_indexer_relationships,
read_indexer_reports,
read_indexer_text_units,
)
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
import os import pandas as pd import tiktoken from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey from graphrag.query.indexer_adapters import ( read_indexer_covariates, read_indexer_entities, read_indexer_relationships, read_indexer_reports, read_indexer_text_units, ) from graphrag.query.question_gen.local_gen import LocalQuestionGen from graphrag.query.structured_search.local_search.mixed_context import ( LocalSearchMixedContext, ) from graphrag.query.structured_search.local_search.search import LocalSearch from graphrag.vector_stores.lancedb import LanceDBVectorStore
本地搜索示例¶
本地搜索方法通过将来自 AI 提取的知识图谱的相关数据与原始文档的文本块相结合来生成答案。此方法适用于需要理解文档中提到的特定实体的问题(例如,洋甘菊的治疗特性是什么?)。
加载文本单元和图数据表作为本地搜索的上下文¶
- 在此测试中,我们首先将索引输出从 parquet 文件加载到数据帧,然后将这些数据帧转换为与知识模型对齐的数据对象集合。
加载表到数据帧¶
在 [3] 中
已复制!
# Locations of the indexing pipeline's parquet outputs and vector store.
INPUT_DIR = "./inputs/operation dulce"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"  # on-disk LanceDB database created during indexing
# Parquet table names emitted by the GraphRAG indexer.
COMMUNITY_REPORT_TABLE = "community_reports"
ENTITY_TABLE = "entities"
COMMUNITY_TABLE = "communities"
RELATIONSHIP_TABLE = "relationships"
COVARIATE_TABLE = "covariates"
TEXT_UNIT_TABLE = "text_units"
# Community hierarchy depth used when loading entities/reports below —
# presumably higher values select finer-grained communities; confirm against
# read_indexer_entities / read_indexer_reports.
COMMUNITY_LEVEL = 2
INPUT_DIR = "./inputs/operation dulce" LANCEDB_URI = f"{INPUT_DIR}/lancedb" COMMUNITY_REPORT_TABLE = "community_reports" ENTITY_TABLE = "entities" COMMUNITY_TABLE = "communities" RELATIONSHIP_TABLE = "relationships" COVARIATE_TABLE = "covariates" TEXT_UNIT_TABLE = "text_units" COMMUNITY_LEVEL = 2
读取实体¶
在 [4] 中
已复制!
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
# adapt the raw dataframes into knowledge-model Entity objects
# (COMMUNITY_LEVEL selects the community hierarchy depth — see indexer_adapters)
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)
# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
# open the LanceDB database written by the indexing run
description_embedding_store.connect(db_uri=LANCEDB_URI)
print(f"Entity count: {len(entity_df)}")
entity_df.head()
# 读取节点表以获取社区和度数据 entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet") community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet") entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL) # 将描述嵌入加载到内存中的 lancedb 向量存储 # 要连接到远程数据库,请指定 url 和端口值。 description_embedding_store = LanceDBVectorStore( collection_name="default-entity-description", ) description_embedding_store.connect(db_uri=LANCEDB_URI) print(f"实体数量:{len(entity_df)}") entity_df.head()
Entity count: 18
Out[4]
id | human_readable_id | title | type | description | text_unit_ids | frequency | degree | x | y | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 425a7862-0aef-4f69-a4c8-8bd42151c9d4 | 0 | ALEX MERCER | PERSON | Agent Alex Mercer is a determined individual w... | [8e938693af886bfd081acbbe8384c3671446bff84a134... | 4 | 9 | 0 | 0 |
1 | bcdbf1fc-0dc1-460f-bc71-2781729c96ba | 1 | TAYLOR CRUZ | PERSON | Agent Taylor Cruz is a commanding and authorit... | [8e938693af886bfd081acbbe8384c3671446bff84a134... | 4 | 8 | 0 | 0 |
2 | ef02ef24-5762-46ce-93ce-7dea6fc86595 | 2 | JORDAN HAYES | PERSON | Dr. Jordan Hayes is a scientist and a member o... | [8e938693af886bfd081acbbe8384c3671446bff84a134... | 4 | 9 | 0 | 0 |
3 | 8b163d27-e43a-4a2c-a26f-866778d8720e | 3 | SAM RIVERA | PERSON | Sam Rivera is a cybersecurity expert and a tal... | [8e938693af886bfd081acbbe8384c3671446bff84a134... | 4 | 8 | 0 | 0 |
4 | 542aa5bd-ba2d-400a-8488-c52d50bc300d | 4 | PARANORMAL MILITARY SQUAD | ORGANIZATION | The PARANORMAL MILITARY SQUAD is an elite grou... | [8e938693af886bfd081acbbe8384c3671446bff84a134... | 2 | 6 | 0 | 0 |
读取关系¶
在 [5] 中
已复制!
# Load the relationship edges produced by indexing and adapt them into
# knowledge-model Relationship objects for the context builder.
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet") relationships = read_indexer_relationships(relationship_df) print(f"关系数量:{len(relationship_df)}") relationship_df.head()
Relationship count: 54
Out[5]
id | human_readable_id | source | target | description | weight | combined_degree | text_unit_ids | |
---|---|---|---|---|---|---|---|---|
0 | 2bfad9f4-5abd-48d0-8db3-a9cad9120413 | 0 | ALEX MERCER | TAYLOR CRUZ | Alex Mercer and Taylor Cruz are both agents wo... | 37.0 | 17 | [8e938693af886bfd081acbbe8384c3671446bff84a134... |
1 | 6cbb838f-9e83-4086-a684-15c8ed709e52 | 1 | ALEX MERCER | JORDAN HAYES | Alex Mercer and Jordan Hayes are both agents w... | 42.0 | 18 | [8e938693af886bfd081acbbe8384c3671446bff84a134... |
2 | bfdc25f1-80ca-477b-a304-94465b69e680 | 2 | ALEX MERCER | SAM RIVERA | Alex Mercer and Sam Rivera are both agents and... | 26.0 | 17 | [8e938693af886bfd081acbbe8384c3671446bff84a134... |
3 | 7a7e943d-a4f5-487b-9625-5d0907c4c26d | 3 | ALEX MERCER | PARANORMAL MILITARY SQUAD | Alex Mercer is a member of the Paranormal Mili... | 17.0 | 15 | [8e938693af886bfd081acbbe8384c3671446bff84a134... |
4 | 5e00bcb9-a17e-4c27-8241-6ebb286a7fc6 | 4 | ALEX MERCER | DULCE | Alex Mercer is preparing to lead the team into... | 15.0 | 14 | [8e938693af886bfd081acbbe8384c3671446bff84a134... |
在 [6] 中
已复制!
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")
claims = read_indexer_covariates(covariate_df)
print(f"Claim records: {len(claims)}")
# Group the claim records under a "claims" key — the mapping shape that
# LocalSearchMixedContext takes for its covariates argument below.
covariates = {"claims": claims}
# 注意:协变量默认是关闭的,因为它们通常需要提示调优才能有价值 # 请查看 GRAPHRAG_CLAIM_* 设置 covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet") claims = read_indexer_covariates(covariate_df) print(f"Claim records: {len(claims)}") covariates = {"claims": claims}
Claim records: 17
读取社区报告¶
在 [7] 中
已复制!
# Load community reports and adapt them to knowledge-model report objects,
# again filtered by COMMUNITY_LEVEL.
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
print(f"Report records: {len(report_df)}")
report_df.head()
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet") reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL) print(f"Report records: {len(report_df)}") report_df.head()
Report records: 2
Out[7]
id | human_readable_id | community | level | parent | children | title | summary | full_content | rank | rating_explanation | findings | full_content_json | period | size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6c3a555680d647ac8be866a129c7b0ea | 0 | 0 | 0 | -1 | [] | Operation: Dulce and Dulce Base Exploration | The community revolves around 'Operation: Dulc... | # Operation: Dulce and Dulce Base Exploration\... | 8.5 | The impact severity rating is high due to the ... | [{'explanation': 'Operation: Dulce is a signif... | {\n "title": "Operation: Dulce and Dulce Ba... | 2025-03-04 | 7 |
1 | 0127331a1ea34b8ba19de2c2a4cb3bc9 | 1 | 1 | 0 | -1 | [] | Paranormal Military Squad and Operation: Dulce | The community centers around the Paranormal Mi... | # Paranormal Military Squad and Operation: Dul... | 8.5 | The impact severity rating is high due to the ... | [{'explanation': 'Agent Alex Mercer is a key f... | {\n "title": "Paranormal Military Squad and... | 2025-03-04 | 9 |
读取文本单元¶
在 [8] 中
已复制!
# Load the source-document text chunks ("text units") so the search can mix
# raw text into the context window alongside graph data.
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)
print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet") text_units = read_indexer_text_units(text_unit_df) print(f"文本单元记录:{len(text_unit_df)}") text_unit_df.head()
Text unit records: 5
Out[8]
id | human_readable_id | text | n_tokens | document_ids | entity_ids | relationship_ids | covariate_ids | |
---|---|---|---|---|---|---|---|---|
0 | 8e938693af886bfd081acbbe8384c3671446bff84a134a... | 1 | # Operation: Dulce\n\n## Chapter 1\n\nThe thru... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [745d28dd-be20-411b-85ff-1c69ca70e7b3, 9cba185... |
1 | fd1f46d32e1df6cd429542aeda3d64ddf3745ccb80f443... | 2 | , the hollow echo of the bay a stark reminder ... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [4f9b461f-5e8f-465d-9586-e2fc81787062, 0f74618... |
2 | 7296d9a1f046854d59079dc183de8a054c27c4843d2979... | 3 | differently than praise from others. This was... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [3ef1be9c-4080-4fac-99bd-c4a636248904, 8730b20... |
3 | ac72722a02ac71242a2a91fca323198d04197daf60515d... | 4 | contrast to the rigid silence enveloping the ... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [2c292047-b79a-4958-ab57-7bf7d7a22c92, 3cbd18a... |
4 | 4c277337d461a16aaf8f9760ddb8b44ef220e948a2341d... | 5 | a mask of duty.\n\nIn the midst of the descen... | 35 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [d084d615-3584-4ec8-9931-90aa6075c764, 4b84859... | [6efdc42e-69a2-47c0-97ec-4b296cd16d5e] | [db8da02f-f889-4bb5-8e81-ab2a72e380bb] |
在 [9] 中
已复制!
from graphrag.config.enums import ModelType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager

# Credentials and model names come from the environment; an unset variable
# raises KeyError here, which is the desired fail-fast behavior.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

# Chat model used to generate the final answer.
chat_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIChat,
    model=llm_model,
    max_retries=20,
)
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.OpenAIChat,
    config=chat_config,
)

# tiktoken.encoding_for_model raises KeyError for model names not in its
# registry (e.g. Azure deployment names or newly released models); fall back
# to cl100k_base, the encoding used by current OpenAI chat/embedding models.
try:
    token_encoder = tiktoken.encoding_for_model(llm_model)
except KeyError:
    token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding model used to map the query to entities in the vector store.
embedding_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIEmbedding,
    model=embedding_model,
    max_retries=20,
)
text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.OpenAIEmbedding,
    config=embedding_config,
)
from graphrag.config.enums import ModelType from graphrag.config.models.language_model_config import LanguageModelConfig from graphrag.language_model.manager import ModelManager api_key = os.environ["GRAPHRAG_API_KEY"] llm_model = os.environ["GRAPHRAG_LLM_MODEL"] embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"] chat_config = LanguageModelConfig( api_key=api_key, type=ModelType.OpenAIChat, model=llm_model, max_retries=20, ) chat_model = ModelManager().get_or_create_chat_model( name="local_search", model_type=ModelType.OpenAIChat, config=chat_config, ) token_encoder = tiktoken.encoding_for_model(llm_model) embedding_config = LanguageModelConfig( api_key=api_key, type=ModelType.OpenAIEmbedding, model=embedding_model, max_retries=20, ) text_embedder = ModelManager().get_or_create_embedding_model( name="local_search_embedding", model_type=ModelType.OpenAIEmbedding, config=embedding_config, )
创建本地搜索上下文构建器¶
在 [10] 中
已复制!
# Assemble the mixed-context builder that, per query, selects entities,
# relationships, covariates, community reports and raw text units to fill
# the context window.
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)
context_builder = LocalSearchMixedContext( community_reports=reports, text_units=text_units, entities=entities, relationships=relationships, # 如果您在索引期间未运行协变量,请将其设置为 None covariates=covariates, entity_text_embeddings=description_embedding_store, embedding_vectorstore_key=EntityVectorStoreKey.ID, # 如果向量存储使用实体标题作为 id,请将其设置为 EntityVectorStoreKey.TITLE text_embedder=text_embedder, token_encoder=token_encoder, )
创建本地搜索引擎¶
在 [11] 中
已复制!
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_context_tokens: maximum number of tokens to use for the context window.
local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    # FIX: build_context's parameter is named max_context_tokens (see its
    # signature); the old "max_tokens" key was silently absorbed by **kwargs,
    # so the intended 12k context cap was never applied.
    "max_context_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

model_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
    "temperature": 0.0,
}
# text_unit_prop: 专用于相关文本单元的上下文窗口的比例 # community_prop: 专用于社区报告的上下文窗口的比例。 # 剩余比例专用于实体和关系。 text_unit_prop 和 community_prop 的总和应 <= 1 # conversation_history_max_turns: 对话历史记录中包含的最大轮数。 # conversation_history_user_turns_only: 如果为 True,则仅在对话历史记录中包含用户查询。 # top_k_mapped_entities: 从实体描述嵌入存储中检索的相关实体数量。 # top_k_relationships: 控制拉入上下文窗口的非网络关系的數量。 # include_entity_rank: 如果为 True,则在上下文窗口的实体表中包含实体排名。 默认实体排名 = 节点度。 # include_relationship_weight: 如果为 True,则在上下文窗口中包含关系权重。 # include_community_rank: 如果为 True,则在上下文窗口中包含社区排名。 # return_candidate_context: 如果为 True,则返回一组包含所有候选实体/关系/协变量记录的数据帧,这些记录 # 可能相关。 请注意,并非所有这些记录都将包含在上下文窗口中。 这些 # 数据帧中的 "in_context" 列指示记录是否包含在上下文窗口中。 # max_tokens: 上下文窗口使用的最大 token 数量。 local_context_params = { "text_unit_prop": 0.5, "community_prop": 0.1, "conversation_history_max_turns": 5, "conversation_history_user_turns_only": True, "top_k_mapped_entities": 10, "top_k_relationships": 10, "include_entity_rank": True, "include_relationship_weight": True, "include_community_rank": False, "return_candidate_context": False, "embedding_vectorstore_key": EntityVectorStoreKey.ID, # 如果向量存储使用实体标题作为 ID,则将其设置为 EntityVectorStoreKey.TITLE "max_tokens": 12_000, # 根据模型上的 token 限制更改此值(如果您使用的模型具有 8k 限制,则一个好的设置可以是 5000) } model_params = { "max_tokens": 2_000, # 根据模型上的 token 限制更改此值(如果您使用的模型具有 8k 限制,则一个好的设置可以是 1000=1500) "temperature": 0.0, }
在 [12] 中
已复制!
# Wire the model, context builder and parameter dicts into the local search
# engine; model_params are forwarded to the LLM call, context_builder_params
# to context_builder.build_context on every query.
search_engine = LocalSearch(
    model=chat_model,
    context_builder=context_builder,
    token_encoder=token_encoder,
    model_params=model_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)
search_engine = LocalSearch( model=chat_model, context_builder=context_builder, token_encoder=token_encoder, model_params=model_params, context_builder_params=local_context_params, response_type="multiple paragraphs", # 描述响应类型和格式的自由文本,可以是任何内容,例如:优先级列表、单个段落、多个段落、多页报告 )
在示例查询上运行本地搜索¶
在 [13] 中
已复制!
# Top-level await works inside a notebook; in a plain script, wrap this in
# asyncio.run(...) instead.
result = await search_engine.search("Tell me about Agent Mercer")
print(result.response)
result = await search_engine.search("告诉我关于 Agent Mercer 的信息") print(result.response)
--------------------------------------------------------------------------- AuthenticationError Traceback (most recent call last) Cell In[13], line 1 ----> 1 result = await search_engine.search("Tell me about Agent Mercer") 2 print(result.response) File ~/work/graphrag/graphrag/graphrag/query/structured_search/local_search/search.py:63, in LocalSearch.search(self, query, conversation_history, **kwargs) 61 search_prompt = "" 62 llm_calls, prompt_tokens, output_tokens = {}, {}, {} ---> 63 context_result = self.context_builder.build_context( 64 query=query, 65 conversation_history=conversation_history, 66 **kwargs, 67 **self.context_builder_params, 68 ) 69 llm_calls["build_context"] = context_result.llm_calls 70 prompt_tokens["build_context"] = context_result.prompt_tokens File ~/work/graphrag/graphrag/graphrag/query/structured_search/local_search/mixed_context.py:139, in LocalSearchMixedContext.build_context(self, query, conversation_history, include_entity_names, exclude_entity_names, conversation_history_max_turns, conversation_history_user_turns_only, max_context_tokens, text_unit_prop, community_prop, top_k_mapped_entities, top_k_relationships, include_community_rank, include_entity_rank, rank_description, include_relationship_weight, relationship_ranking_attribute, return_candidate_context, use_community_summary, min_community_rank, community_context_name, column_delimiter, **kwargs) 134 pre_user_questions = "\n".join( 135 conversation_history.get_user_turns(conversation_history_max_turns) 136 ) 137 query = f"{query}\n{pre_user_questions}" --> 139 selected_entities = map_query_to_entities( 140 query=query, 141 text_embedding_vectorstore=self.entity_text_embeddings, 142 text_embedder=self.text_embedder, 143 all_entities_dict=self.entities, 144 embedding_vectorstore_key=self.embedding_vectorstore_key, 145 include_entity_names=include_entity_names, 146 exclude_entity_names=exclude_entity_names, 147 k=top_k_mapped_entities, 148 oversample_scaler=2, 149 ) 151 # build 
context 152 final_context = list[str]() File ~/work/graphrag/graphrag/graphrag/query/context_builder/entity_extraction.py:58, in map_query_to_entities(query, text_embedding_vectorstore, text_embedder, all_entities_dict, embedding_vectorstore_key, include_entity_names, exclude_entity_names, k, oversample_scaler) 54 matched_entities = [] 55 if query != "": 56 # get entities with highest semantic similarity to query 57 # oversample to account for excluded entities ---> 58 search_results = text_embedding_vectorstore.similarity_search_by_text( 59 text=query, 60 text_embedder=lambda t: text_embedder.embed(t), 61 k=k * oversample_scaler, 62 ) 63 for result in search_results: 64 if embedding_vectorstore_key == EntityVectorStoreKey.ID and isinstance( 65 result.document.id, str 66 ): File ~/work/graphrag/graphrag/graphrag/vector_stores/lancedb.py:134, in LanceDBVectorStore.similarity_search_by_text(self, text, text_embedder, k, **kwargs) 130 def similarity_search_by_text( 131 self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any 132 ) -> list[VectorStoreSearchResult]: 133 """Perform a similarity search using a given input text.""" --> 134 query_embedding = text_embedder(text) 135 if query_embedding: 136 return self.similarity_search_by_vector(query_embedding, k) File ~/work/graphrag/graphrag/graphrag/query/context_builder/entity_extraction.py:60, in map_query_to_entities.<locals>.<lambda>(t) 54 matched_entities = [] 55 if query != "": 56 # get entities with highest semantic similarity to query 57 # oversample to account for excluded entities 58 search_results = text_embedding_vectorstore.similarity_search_by_text( 59 text=query, ---> 60 text_embedder=lambda t: text_embedder.embed(t), 61 k=k * oversample_scaler, 62 ) 63 for result in search_results: 64 if embedding_vectorstore_key == EntityVectorStoreKey.ID and isinstance( 65 result.document.id, str 66 ): File ~/work/graphrag/graphrag/graphrag/language_model/providers/fnllm/models.py:237, in 
OpenAIEmbeddingFNLLM.embed(self, text, **kwargs) 225 def embed(self, text: str, **kwargs) -> list[float]: 226 """ 227 Embed the given text using the Model. 228 (...) 235 The embeddings of the text. 236 """ --> 237 return run_coroutine_sync(self.aembed(text, **kwargs)) File ~/work/graphrag/graphrag/graphrag/language_model/providers/fnllm/utils.py:127, in run_coroutine_sync(coroutine) 125 _thr.start() 126 future = asyncio.run_coroutine_threadsafe(coroutine, _loop) --> 127 return future.result() File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/concurrent/futures/_base.py:456, in Future.result(self, timeout) 454 raise CancelledError() 455 elif self._state == FINISHED: --> 456 return self.__get_result() 457 else: 458 raise TimeoutError() File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self) 399 if self._exception: 400 try: --> 401 raise self._exception 402 finally: 403 # Break a reference cycle with the exception in self._exception 404 self = None File ~/work/graphrag/graphrag/graphrag/language_model/providers/fnllm/models.py:204, in OpenAIEmbeddingFNLLM.aembed(self, text, **kwargs) 192 async def aembed(self, text: str, **kwargs) -> list[float]: 193 """ 194 Embed the given text using the Model. 195 (...) 202 The embeddings of the text. 
203 """ --> 204 response = await self.model([text], **kwargs) 205 if response.output.embeddings is None: 206 msg = "No embeddings found in response" File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/fnllm/base/base_llm.py:144, in BaseLLM.__call__(self, prompt, **kwargs) 142 try: 143 prompt, kwargs = self._rewrite_input(prompt, kwargs) --> 144 return await self._decorated_target(prompt, **kwargs) 145 except BaseException as e: 146 stack_trace = traceback.format_exc() File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/fnllm/base/services/rate_limiter.py:75, in RateLimiter.decorate.<locals>.invoke(prompt, **args) 73 async with self._limiter.use(manifest): 74 await self._events.on_limit_acquired(manifest) ---> 75 result = await delegate(prompt, **args) 76 finally: 77 await self._events.on_limit_released(manifest) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/fnllm/base/base_llm.py:126, in BaseLLM._decorator_target(self, prompt, **kwargs) 121 """Target for the decorator chain. 122 123 Leave signature alone as prompt, kwargs. 
124 """ 125 await self._events.on_execute_llm() --> 126 output = await self._execute_llm(prompt, kwargs) 127 result = LLMOutput(output=output) 128 await self._inject_usage(result) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/fnllm/openai/llm/openai_embeddings_llm.py:126, in OpenAIEmbeddingsLLMImpl._execute_llm(self, prompt, kwargs) 121 local_model_parameters = kwargs.get("model_parameters") 122 embeddings_parameters = self._build_embeddings_parameters( 123 local_model_parameters 124 ) --> 126 result = await self._client.embeddings.create( 127 input=prompt, 128 **embeddings_parameters, 129 ) 130 usage: LLMUsageMetrics | None = None 131 if result.usage: File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/resources/embeddings.py:243, in AsyncEmbeddings.create(self, input, model, dimensions, encoding_format, user, extra_headers, extra_query, extra_body, timeout) 237 embedding.embedding = np.frombuffer( # type: ignore[no-untyped-call] 238 base64.b64decode(data), dtype="float32" 239 ).tolist() 241 return obj --> 243 return await self._post( 244 "/embeddings", 245 body=maybe_transform(params, embedding_create_params.EmbeddingCreateParams), 246 options=make_request_options( 247 extra_headers=extra_headers, 248 extra_query=extra_query, 249 extra_body=extra_body, 250 timeout=timeout, 251 post_parser=parser, 252 ), 253 cast_to=CreateEmbeddingResponse, 254 ) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1767, in AsyncAPIClient.post(self, path, cast_to, body, files, options, stream, stream_cls) 1753 async def post( 1754 self, 1755 path: str, (...) 
1762 stream_cls: type[_AsyncStreamT] | None = None, 1763 ) -> ResponseT | _AsyncStreamT: 1764 opts = FinalRequestOptions.construct( 1765 method="post", url=path, json_data=body, files=await async_to_httpx_files(files), **options 1766 ) -> 1767 return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1461, in AsyncAPIClient.request(self, cast_to, options, stream, stream_cls, remaining_retries) 1458 else: 1459 retries_taken = 0 -> 1461 return await self._request( 1462 cast_to=cast_to, 1463 options=options, 1464 stream=stream, 1465 stream_cls=stream_cls, 1466 retries_taken=retries_taken, 1467 ) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1562, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) 1559 await err.response.aread() 1561 log.debug("Re-raising status error") -> 1562 raise self._make_status_error_from_response(err.response) from None 1564 return await self._process_response( 1565 cast_to=cast_to, 1566 options=options, (...) 1570 retries_taken=retries_taken, 1571 ) AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************************************************************************************************************************zWYA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
在 [14] 中
已复制!
# Second example query against the same engine (notebook top-level await).
question = "Tell me about Dr. Jordan Hayes"
result = await search_engine.search(question)
print(result.response)
question = "告诉我关于 Dr. Jordan Hayes 的信息" result = await search_engine.search(question) print(result.response)
--------------------------------------------------------------------------- AuthenticationError Traceback (most recent call last) Cell In[14], line 2 1 question = "Tell me about Dr. Jordan Hayes" ----> 2 result = await search_engine.search(question) 3 print(result.response) File ~/work/graphrag/graphrag/graphrag/query/structured_search/local_search/search.py:63, in LocalSearch.search(self, query, conversation_history, **kwargs) 61 search_prompt = "" 62 llm_calls, prompt_tokens, output_tokens = {}, {}, {} ---> 63 context_result = self.context_builder.build_context( 64 query=query, 65 conversation_history=conversation_history, 66 **kwargs, 67 **self.context_builder_params, 68 ) 69 llm_calls["build_context"] = context_result.llm_calls 70 prompt_tokens["build_context"] = context_result.prompt_tokens File ~/work/graphrag/graphrag/graphrag/query/structured_search/local_search/mixed_context.py:139, in LocalSearchMixedContext.build_context(self, query, conversation_history, include_entity_names, exclude_entity_names, conversation_history_max_turns, conversation_history_user_turns_only, max_context_tokens, text_unit_prop, community_prop, top_k_mapped_entities, top_k_relationships, include_community_rank, include_entity_rank, rank_description, include_relationship_weight, relationship_ranking_attribute, return_candidate_context, use_community_summary, min_community_rank, community_context_name, column_delimiter, **kwargs) 134 pre_user_questions = "\n".join( 135 conversation_history.get_user_turns(conversation_history_max_turns) 136 ) 137 query = f"{query}\n{pre_user_questions}" --> 139 selected_entities = map_query_to_entities( 140 query=query, 141 text_embedding_vectorstore=self.entity_text_embeddings, 142 text_embedder=self.text_embedder, 143 all_entities_dict=self.entities, 144 embedding_vectorstore_key=self.embedding_vectorstore_key, 145 include_entity_names=include_entity_names, 146 exclude_entity_names=exclude_entity_names, 147 k=top_k_mapped_entities, 148 
oversample_scaler=2, 149 ) 151 # build context 152 final_context = list[str]() File ~/work/graphrag/graphrag/graphrag/query/context_builder/entity_extraction.py:58, in map_query_to_entities(query, text_embedding_vectorstore, text_embedder, all_entities_dict, embedding_vectorstore_key, include_entity_names, exclude_entity_names, k, oversample_scaler) 54 matched_entities = [] 55 if query != "": 56 # get entities with highest semantic similarity to query 57 # oversample to account for excluded entities ---> 58 search_results = text_embedding_vectorstore.similarity_search_by_text( 59 text=query, 60 text_embedder=lambda t: text_embedder.embed(t), 61 k=k * oversample_scaler, 62 ) 63 for result in search_results: 64 if embedding_vectorstore_key == EntityVectorStoreKey.ID and isinstance( 65 result.document.id, str 66 ): File ~/work/graphrag/graphrag/graphrag/vector_stores/lancedb.py:134, in LanceDBVectorStore.similarity_search_by_text(self, text, text_embedder, k, **kwargs) 130 def similarity_search_by_text( 131 self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any 132 ) -> list[VectorStoreSearchResult]: 133 """Perform a similarity search using a given input text.""" --> 134 query_embedding = text_embedder(text) 135 if query_embedding: 136 return self.similarity_search_by_vector(query_embedding, k) File ~/work/graphrag/graphrag/graphrag/query/context_builder/entity_extraction.py:60, in map_query_to_entities.<locals>.<lambda>(t) 54 matched_entities = [] 55 if query != "": 56 # get entities with highest semantic similarity to query 57 # oversample to account for excluded entities 58 search_results = text_embedding_vectorstore.similarity_search_by_text( 59 text=query, ---> 60 text_embedder=lambda t: text_embedder.embed(t), 61 k=k * oversample_scaler, 62 ) 63 for result in search_results: 64 if embedding_vectorstore_key == EntityVectorStoreKey.ID and isinstance( 65 result.document.id, str 66 ): File 
~/work/graphrag/graphrag/graphrag/language_model/providers/fnllm/models.py:237, in OpenAIEmbeddingFNLLM.embed(self, text, **kwargs) 225 def embed(self, text: str, **kwargs) -> list[float]: 226 """ 227 Embed the given text using the Model. 228 (...) 235 The embeddings of the text. 236 """ --> 237 return run_coroutine_sync(self.aembed(text, **kwargs)) File ~/work/graphrag/graphrag/graphrag/language_model/providers/fnllm/utils.py:127, in run_coroutine_sync(coroutine) 125 _thr.start() 126 future = asyncio.run_coroutine_threadsafe(coroutine, _loop) --> 127 return future.result() File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/concurrent/futures/_base.py:456, in Future.result(self, timeout) 454 raise CancelledError() 455 elif self._state == FINISHED: --> 456 return self.__get_result() 457 else: 458 raise TimeoutError() File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self) 399 if self._exception: 400 try: --> 401 raise self._exception 402 finally: 403 # Break a reference cycle with the exception in self._exception 404 self = None File ~/work/graphrag/graphrag/graphrag/language_model/providers/fnllm/models.py:204, in OpenAIEmbeddingFNLLM.aembed(self, text, **kwargs) 192 async def aembed(self, text: str, **kwargs) -> list[float]: 193 """ 194 Embed the given text using the Model. 195 (...) 202 The embeddings of the text. 
203 """ --> 204 response = await self.model([text], **kwargs) 205 if response.output.embeddings is None: 206 msg = "No embeddings found in response" File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/fnllm/base/base_llm.py:144, in BaseLLM.__call__(self, prompt, **kwargs) 142 try: 143 prompt, kwargs = self._rewrite_input(prompt, kwargs) --> 144 return await self._decorated_target(prompt, **kwargs) 145 except BaseException as e: 146 stack_trace = traceback.format_exc() File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/fnllm/base/services/rate_limiter.py:75, in RateLimiter.decorate.<locals>.invoke(prompt, **args) 73 async with self._limiter.use(manifest): 74 await self._events.on_limit_acquired(manifest) ---> 75 result = await delegate(prompt, **args) 76 finally: 77 await self._events.on_limit_released(manifest) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/fnllm/base/base_llm.py:126, in BaseLLM._decorator_target(self, prompt, **kwargs) 121 """Target for the decorator chain. 122 123 Leave signature alone as prompt, kwargs. 
124 """ 125 await self._events.on_execute_llm() --> 126 output = await self._execute_llm(prompt, kwargs) 127 result = LLMOutput(output=output) 128 await self._inject_usage(result) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/fnllm/openai/llm/openai_embeddings_llm.py:126, in OpenAIEmbeddingsLLMImpl._execute_llm(self, prompt, kwargs) 121 local_model_parameters = kwargs.get("model_parameters") 122 embeddings_parameters = self._build_embeddings_parameters( 123 local_model_parameters 124 ) --> 126 result = await self._client.embeddings.create( 127 input=prompt, 128 **embeddings_parameters, 129 ) 130 usage: LLMUsageMetrics | None = None 131 if result.usage: File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/resources/embeddings.py:243, in AsyncEmbeddings.create(self, input, model, dimensions, encoding_format, user, extra_headers, extra_query, extra_body, timeout) 237 embedding.embedding = np.frombuffer( # type: ignore[no-untyped-call] 238 base64.b64decode(data), dtype="float32" 239 ).tolist() 241 return obj --> 243 return await self._post( 244 "/embeddings", 245 body=maybe_transform(params, embedding_create_params.EmbeddingCreateParams), 246 options=make_request_options( 247 extra_headers=extra_headers, 248 extra_query=extra_query, 249 extra_body=extra_body, 250 timeout=timeout, 251 post_parser=parser, 252 ), 253 cast_to=CreateEmbeddingResponse, 254 ) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1767, in AsyncAPIClient.post(self, path, cast_to, body, files, options, stream, stream_cls) 1753 async def post( 1754 self, 1755 path: str, (...) 
1762 stream_cls: type[_AsyncStreamT] | None = None, 1763 ) -> ResponseT | _AsyncStreamT: 1764 opts = FinalRequestOptions.construct( 1765 method="post", url=path, json_data=body, files=await async_to_httpx_files(files), **options 1766 ) -> 1767 return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1461, in AsyncAPIClient.request(self, cast_to, options, stream, stream_cls, remaining_retries) 1458 else: 1459 retries_taken = 0 -> 1461 return await self._request( 1462 cast_to=cast_to, 1463 options=options, 1464 stream=stream, 1465 stream_cls=stream_cls, 1466 retries_taken=retries_taken, 1467 ) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1562, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) 1559 await err.response.aread() 1561 log.debug("Re-raising status error") -> 1562 raise self._make_status_error_from_response(err.response) from None 1564 return await self._process_response( 1565 cast_to=cast_to, 1566 options=options, (...) 1570 retries_taken=retries_taken, 1571 ) AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************************************************************************************************************************zWYA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
检查用于生成响应的上下文数据¶
在 [15] 中
已复制!
result.context_data["entities"].head()
result.context_data["entities"].head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[15], line 1 ----> 1 result.context_data["entities"].head() NameError: name 'result' is not defined
在 [16] 中
已复制!
result.context_data["relationships"].head()
result.context_data["relationships"].head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[16], line 1 ----> 1 result.context_data["relationships"].head() NameError: name 'result' is not defined
在 [17] 中
已复制!
if "reports" in result.context_data:
result.context_data["reports"].head()
if "reports" in result.context_data: result.context_data["reports"].head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[17], line 1 ----> 1 if "reports" in result.context_data: 2 result.context_data["reports"].head() NameError: name 'result' is not defined
在 [18] 中
已复制!
result.context_data["sources"].head()
result.context_data["sources"].head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[18], line 1 ----> 1 result.context_data["sources"].head() NameError: name 'result' is not defined
在 [19] 中
已复制!
if "claims" in result.context_data:
print(result.context_data["claims"].head())
if "claims" in result.context_data: print(result.context_data["claims"].head())
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[19], line 1 ----> 1 if "claims" in result.context_data: 2 print(result.context_data["claims"].head()) NameError: name 'result' is not defined
问题生成¶
此函数接受用户查询列表并生成下一个候选问题。
在 [20] 中
已复制!
question_generator = LocalQuestionGen(
model=chat_model,
context_builder=context_builder,
token_encoder=token_encoder,
model_params=model_params,
context_builder_params=local_context_params,
)
question_generator = LocalQuestionGen( model=chat_model, context_builder=context_builder, token_encoder=token_encoder, model_params=model_params, context_builder_params=local_context_params, )
在 [21] 中
已复制!
question_history = [
"Tell me about Agent Mercer",
"What happens in Dulce military base?",
]
candidate_questions = await question_generator.agenerate(
question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)
question_history = [ "Tell me about Agent Mercer", "What happens in Dulce military base?", ] candidate_questions = await question_generator.agenerate( question_history=question_history, context_data=None, question_count=5 ) print(candidate_questions.response)
--------------------------------------------------------------------------- AuthenticationError Traceback (most recent call last) Cell In[21], line 5 1 question_history = [ 2 "Tell me about Agent Mercer", 3 "What happens in Dulce military base?", 4 ] ----> 5 candidate_questions = await question_generator.agenerate( 6 question_history=question_history, context_data=None, question_count=5 7 ) 8 print(candidate_questions.response) File ~/work/graphrag/graphrag/graphrag/query/question_gen/local_gen.py:80, in LocalQuestionGen.agenerate(self, question_history, context_data, question_count, **kwargs) 74 conversation_history = ConversationHistory.from_list(history) 76 if context_data is None: 77 # generate context data based on the question history 78 result = cast( 79 "ContextBuilderResult", ---> 80 self.context_builder.build_context( 81 query=question_text, 82 conversation_history=conversation_history, 83 **kwargs, 84 **self.context_builder_params, 85 ), 86 ) 87 context_data = cast("str", result.context_chunks) 88 context_records = result.context_records File ~/work/graphrag/graphrag/graphrag/query/structured_search/local_search/mixed_context.py:139, in LocalSearchMixedContext.build_context(self, query, conversation_history, include_entity_names, exclude_entity_names, conversation_history_max_turns, conversation_history_user_turns_only, max_context_tokens, text_unit_prop, community_prop, top_k_mapped_entities, top_k_relationships, include_community_rank, include_entity_rank, rank_description, include_relationship_weight, relationship_ranking_attribute, return_candidate_context, use_community_summary, min_community_rank, community_context_name, column_delimiter, **kwargs) 134 pre_user_questions = "\n".join( 135 conversation_history.get_user_turns(conversation_history_max_turns) 136 ) 137 query = f"{query}\n{pre_user_questions}" --> 139 selected_entities = map_query_to_entities( 140 query=query, 141 text_embedding_vectorstore=self.entity_text_embeddings, 142 
text_embedder=self.text_embedder, 143 all_entities_dict=self.entities, 144 embedding_vectorstore_key=self.embedding_vectorstore_key, 145 include_entity_names=include_entity_names, 146 exclude_entity_names=exclude_entity_names, 147 k=top_k_mapped_entities, 148 oversample_scaler=2, 149 ) 151 # build context 152 final_context = list[str]() File ~/work/graphrag/graphrag/graphrag/query/context_builder/entity_extraction.py:58, in map_query_to_entities(query, text_embedding_vectorstore, text_embedder, all_entities_dict, embedding_vectorstore_key, include_entity_names, exclude_entity_names, k, oversample_scaler) 54 matched_entities = [] 55 if query != "": 56 # get entities with highest semantic similarity to query 57 # oversample to account for excluded entities ---> 58 search_results = text_embedding_vectorstore.similarity_search_by_text( 59 text=query, 60 text_embedder=lambda t: text_embedder.embed(t), 61 k=k * oversample_scaler, 62 ) 63 for result in search_results: 64 if embedding_vectorstore_key == EntityVectorStoreKey.ID and isinstance( 65 result.document.id, str 66 ): File ~/work/graphrag/graphrag/graphrag/vector_stores/lancedb.py:134, in LanceDBVectorStore.similarity_search_by_text(self, text, text_embedder, k, **kwargs) 130 def similarity_search_by_text( 131 self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any 132 ) -> list[VectorStoreSearchResult]: 133 """Perform a similarity search using a given input text.""" --> 134 query_embedding = text_embedder(text) 135 if query_embedding: 136 return self.similarity_search_by_vector(query_embedding, k) File ~/work/graphrag/graphrag/graphrag/query/context_builder/entity_extraction.py:60, in map_query_to_entities.<locals>.<lambda>(t) 54 matched_entities = [] 55 if query != "": 56 # get entities with highest semantic similarity to query 57 # oversample to account for excluded entities 58 search_results = text_embedding_vectorstore.similarity_search_by_text( 59 text=query, ---> 60 text_embedder=lambda t: 
text_embedder.embed(t), 61 k=k * oversample_scaler, 62 ) 63 for result in search_results: 64 if embedding_vectorstore_key == EntityVectorStoreKey.ID and isinstance( 65 result.document.id, str 66 ): File ~/work/graphrag/graphrag/graphrag/language_model/providers/fnllm/models.py:237, in OpenAIEmbeddingFNLLM.embed(self, text, **kwargs) 225 def embed(self, text: str, **kwargs) -> list[float]: 226 """ 227 Embed the given text using the Model. 228 (...) 235 The embeddings of the text. 236 """ --> 237 return run_coroutine_sync(self.aembed(text, **kwargs)) File ~/work/graphrag/graphrag/graphrag/language_model/providers/fnllm/utils.py:127, in run_coroutine_sync(coroutine) 125 _thr.start() 126 future = asyncio.run_coroutine_threadsafe(coroutine, _loop) --> 127 return future.result() File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/concurrent/futures/_base.py:456, in Future.result(self, timeout) 454 raise CancelledError() 455 elif self._state == FINISHED: --> 456 return self.__get_result() 457 else: 458 raise TimeoutError() File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self) 399 if self._exception: 400 try: --> 401 raise self._exception 402 finally: 403 # Break a reference cycle with the exception in self._exception 404 self = None File ~/work/graphrag/graphrag/graphrag/language_model/providers/fnllm/models.py:204, in OpenAIEmbeddingFNLLM.aembed(self, text, **kwargs) 192 async def aembed(self, text: str, **kwargs) -> list[float]: 193 """ 194 Embed the given text using the Model. 195 (...) 202 The embeddings of the text. 
203 """ --> 204 response = await self.model([text], **kwargs) 205 if response.output.embeddings is None: 206 msg = "No embeddings found in response" File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/fnllm/base/base_llm.py:144, in BaseLLM.__call__(self, prompt, **kwargs) 142 try: 143 prompt, kwargs = self._rewrite_input(prompt, kwargs) --> 144 return await self._decorated_target(prompt, **kwargs) 145 except BaseException as e: 146 stack_trace = traceback.format_exc() File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/fnllm/base/services/rate_limiter.py:75, in RateLimiter.decorate.<locals>.invoke(prompt, **args) 73 async with self._limiter.use(manifest): 74 await self._events.on_limit_acquired(manifest) ---> 75 result = await delegate(prompt, **args) 76 finally: 77 await self._events.on_limit_released(manifest) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/fnllm/base/base_llm.py:126, in BaseLLM._decorator_target(self, prompt, **kwargs) 121 """Target for the decorator chain. 122 123 Leave signature alone as prompt, kwargs. 
124 """ 125 await self._events.on_execute_llm() --> 126 output = await self._execute_llm(prompt, kwargs) 127 result = LLMOutput(output=output) 128 await self._inject_usage(result) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/fnllm/openai/llm/openai_embeddings_llm.py:126, in OpenAIEmbeddingsLLMImpl._execute_llm(self, prompt, kwargs) 121 local_model_parameters = kwargs.get("model_parameters") 122 embeddings_parameters = self._build_embeddings_parameters( 123 local_model_parameters 124 ) --> 126 result = await self._client.embeddings.create( 127 input=prompt, 128 **embeddings_parameters, 129 ) 130 usage: LLMUsageMetrics | None = None 131 if result.usage: File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/resources/embeddings.py:243, in AsyncEmbeddings.create(self, input, model, dimensions, encoding_format, user, extra_headers, extra_query, extra_body, timeout) 237 embedding.embedding = np.frombuffer( # type: ignore[no-untyped-call] 238 base64.b64decode(data), dtype="float32" 239 ).tolist() 241 return obj --> 243 return await self._post( 244 "/embeddings", 245 body=maybe_transform(params, embedding_create_params.EmbeddingCreateParams), 246 options=make_request_options( 247 extra_headers=extra_headers, 248 extra_query=extra_query, 249 extra_body=extra_body, 250 timeout=timeout, 251 post_parser=parser, 252 ), 253 cast_to=CreateEmbeddingResponse, 254 ) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1767, in AsyncAPIClient.post(self, path, cast_to, body, files, options, stream, stream_cls) 1753 async def post( 1754 self, 1755 path: str, (...) 
1762 stream_cls: type[_AsyncStreamT] | None = None, 1763 ) -> ResponseT | _AsyncStreamT: 1764 opts = FinalRequestOptions.construct( 1765 method="post", url=path, json_data=body, files=await async_to_httpx_files(files), **options 1766 ) -> 1767 return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1461, in AsyncAPIClient.request(self, cast_to, options, stream, stream_cls, remaining_retries) 1458 else: 1459 retries_taken = 0 -> 1461 return await self._request( 1462 cast_to=cast_to, 1463 options=options, 1464 stream=stream, 1465 stream_cls=stream_cls, 1466 retries_taken=retries_taken, 1467 ) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1562, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) 1559 await err.response.aread() 1561 log.debug("Re-raising status error") -> 1562 raise self._make_status_error_from_response(err.response) from None 1564 return await self._process_response( 1565 cast_to=cast_to, 1566 options=options, (...) 1570 retries_taken=retries_taken, 1571 ) AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************************************************************************************************************************zWYA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}