API 概览
In [1]
已复制!
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
# Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License.
API 概览¶
本 notebook 演示了如何使用 API 以库的方式与 graphrag 交互,而不是通过 CLI。 请注意,graphrag 的 CLI 实际上通过此 API 连接到库以执行所有操作。
In [2]
已复制!
from pathlib import Path
from pprint import pprint
import pandas as pd
import graphrag.api as api
from graphrag.config.load_config import load_config
from graphrag.index.typing.pipeline_run_result import PipelineRunResult
from pathlib import Path from pprint import pprint import pandas as pd import graphrag.api as api from graphrag.config.load_config import load_config from graphrag.index.typing.pipeline_run_result import PipelineRunResult
In [3]
已复制!
# Root directory of the GraphRAG project: the config file is looked up here and
# the index tables are read from its "output" subfolder.
# Replace the placeholder with a real path before running the cells below.
PROJECT_DIRECTORY = "<your project directory>"
生成一个 GraphRagConfig 对象¶
In [4]
已复制!
# Build the GraphRagConfig by loading the config file found under the project root.
# NOTE: load_config raises FileNotFoundError when the path is not a directory,
# so PROJECT_DIRECTORY must point at a real project first.
graphrag_config = load_config(Path(PROJECT_DIRECTORY))
graphrag_config = load_config(Path(PROJECT_DIRECTORY))
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[4], line 1 ----> 1 graphrag_config = load_config(Path(PROJECT_DIRECTORY)) File ~/work/graphrag/graphrag/graphrag/config/load_config.py:183, in load_config(root_dir, config_filepath, cli_overrides) 151 """Load configuration from a file. 152 153 Parameters (...) 180 If there are pydantic validation errors when instantiating the config. 181 """ 182 root = root_dir.resolve() --> 183 config_path = _get_config_path(root, config_filepath) 184 _load_dotenv(config_path) 185 config_extension = config_path.suffix File ~/work/graphrag/graphrag/graphrag/config/load_config.py:106, in _get_config_path(root_dir, config_filepath) 104 raise FileNotFoundError(msg) 105 else: --> 106 config_path = _search_for_config_in_root_dir(root_dir) 108 if not config_path: 109 msg = f"Config file not found in root directory: {root_dir}" File ~/work/graphrag/graphrag/graphrag/config/load_config.py:40, in _search_for_config_in_root_dir(root) 38 if not root.is_dir(): 39 msg = f"Invalid config path: {root} is not a directory" ---> 40 raise FileNotFoundError(msg) 42 for file in _default_config_files: 43 if (root / file).is_file(): FileNotFoundError: Invalid config path: /home/runner/work/graphrag/graphrag/docs/examples_notebooks/<your project directory> is not a directory
索引 API¶
索引是指摄取原始文本数据并构建知识图谱的过程。 GraphRAG 目前支持纯文本 (.txt) 和 .csv 文件格式。
构建索引¶
In [5]
已复制!
index_result: list[PipelineRunResult] = await api.build_index(config=graphrag_config)
# index_result is a list of workflows that make up the indexing pipeline that was run
for workflow_result in index_result:
status = f"error\n{workflow_result.errors}" if workflow_result.errors else "success"
print(f"Workflow Name: {workflow_result.workflow}\tStatus: {status}")
index_result: list[PipelineRunResult] = await api.build_index(config=graphrag_config) # index_result 是组成为索引运行的管道的一系列工作流 for workflow_result in index_result: status = f"error\n{workflow_result.errors}" if workflow_result.errors else "success" print(f"Workflow Name: {workflow_result.workflow}\tStatus: {status}")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[5], line 1 ----> 1 index_result: list[PipelineRunResult] = await api.build_index(config=graphrag_config) 3 # index_result is a list of workflows that make up the indexing pipeline that was run 4 for workflow_result in index_result: NameError: name 'graphrag_config' is not defined
查询索引¶
要查询索引,必须首先将多个索引文件读入内存并传递给查询 API。
In [6]
已复制!
entities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/entities.parquet")
communities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/communities.parquet")
community_reports = pd.read_parquet(
f"{PROJECT_DIRECTORY}/output/community_reports.parquet"
)
response, context = await api.global_search(
config=graphrag_config,
entities=entities,
communities=communities,
community_reports=community_reports,
community_level=2,
dynamic_community_selection=False,
response_type="Multiple Paragraphs",
query="Who is Scrooge and what are his main relationships?",
)
entities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/entities.parquet") communities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/communities.parquet") community_reports = pd.read_parquet( f"{PROJECT_DIRECTORY}/output/community_reports.parquet" ) response, context = await api.global_search( config=graphrag_config, entities=entities, communities=communities, community_reports=community_reports, community_level=2, dynamic_community_selection=False, response_type="Multiple Paragraphs", query="Who is Scrooge and what are his main relationships?", )
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[6], line 1 ----> 1 entities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/entities.parquet") 2 communities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/communities.parquet") 3 community_reports = pd.read_parquet( 4 f"{PROJECT_DIRECTORY}/output/community_reports.parquet" 5 ) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/pandas/io/parquet.py:667, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs) 664 use_nullable_dtypes = False 665 check_dtype_backend(dtype_backend) --> 667 return impl.read( 668 path, 669 columns=columns, 670 filters=filters, 671 storage_options=storage_options, 672 use_nullable_dtypes=use_nullable_dtypes, 673 dtype_backend=dtype_backend, 674 filesystem=filesystem, 675 **kwargs, 676 ) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/pandas/io/parquet.py:267, in PyArrowImpl.read(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs) 264 if manager == "array": 265 to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] --> 267 path_or_handle, handles, filesystem = _get_path_or_handle( 268 path, 269 filesystem, 270 storage_options=storage_options, 271 mode="rb", 272 ) 273 try: 274 pa_table = self.api.parquet.read_table( 275 path_or_handle, 276 columns=columns, (...) 279 **kwargs, 280 ) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/pandas/io/parquet.py:140, in _get_path_or_handle(path, fs, storage_options, mode, is_dir) 130 handles = None 131 if ( 132 not fs 133 and not is_dir (...) 
138 # fsspec resources can also point to directories 139 # this branch is used for example when reading from non-fsspec URLs --> 140 handles = get_handle( 141 path_or_handle, mode, is_text=False, storage_options=storage_options 142 ) 143 fs = None 144 path_or_handle = handles.handle File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/pandas/io/common.py:882, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 873 handle = open( 874 handle, 875 ioargs.mode, (...) 878 newline="", 879 ) 880 else: 881 # Binary mode --> 882 handle = open(handle, ioargs.mode) 883 handles.append(handle) 885 # Convert BytesIO or file objects passed with an encoding FileNotFoundError: [Errno 2] No such file or directory: '<your project directory>/output/entities.parquet'
response 对象是来自 graphrag 的官方响应,而 context 对象包含有关用于获得最终响应的查询过程的各种元数据。
In [7]
已复制!
# Show the response returned by api.global_search.
print(response)
print(response)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[7], line 1 ----> 1 print(response) NameError: name 'response' is not defined
深入研究上下文,可以为用户提供极其细致的信息,例如哪些数据源(细化到文本块级别)最终被检索并作为上下文的一部分发送给 LLM。
In [8]
已复制!
# Pretty-print the context returned alongside the response by api.global_search.
pprint(context) # noqa: T203
pprint(context) # noqa: T203
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[8], line 1 ----> 1 pprint(context) # noqa: T203 NameError: name 'context' is not defined