105 changes: 38 additions & 67 deletions docs/examples_notebooks/index_migration_to_v3.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -18,45 +18,40 @@
"\n",
"This notebook is used to maintain data model parity with older indexes for version 3.0 of GraphRAG. If you have a pre-3.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment. If you have a pre-2.0 index, please run the v2 migration notebook first!\n",
"\n",
"NOTE: we recommend regenerating your settings.yml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration.\n",
"\n",
"This notebook will also update your settings.yaml to ensure compatibility with our newer vector store collection naming scheme in order to avoid re-ingesting.\n",
"NOTE: we recommend regenerating your settings.yml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. The config changes from v2 to v3 are significant in places!\n",
"\n",
"WARNING: This will overwrite your parquet files, you may want to make a backup!"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This is the directory that has your settings.yaml\n",
"PROJECT_DIRECTORY = \"/Users/naevans/graphrag/working/migration\""
"PROJECT_DIRECTORY = \"<your project directory>\""
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from graphrag.config.load_config import load_config\n",
"from graphrag.storage.factory import StorageFactory\n",
"from graphrag.config.models.graph_rag_config import GraphRagConfig\n",
"from graphrag_common.config import load_config\n",
"from graphrag_storage.storage_factory import create_storage\n",
"\n",
"config = load_config(Path(PROJECT_DIRECTORY))\n",
"storage_config = config.output.model_dump()\n",
"storage = StorageFactory().create_storage(\n",
" storage_type=storage_config[\"type\"],\n",
" kwargs=storage_config,\n",
")"
"config = load_config(GraphRagConfig, config_path=Path(PROJECT_DIRECTORY))\n",
"storage = create_storage(config.output_storage)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -67,7 +62,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -89,71 +84,47 @@
"metadata": {},
"source": [
"## Update settings.yaml\n",
"This next section will attempt to insert index names for each vector index using our new schema structure. It depends on most things being default. If you have already customized your vector store schema it may not be necessary.\n",
"\n",
"The primary goal is to align v2 indexes using our old default naming schema with the new customizability. If don't need this done or you have a more complicated config, comment it out and update your config manually to ensure each index name is set.\n",
"If you have left the default settings for your vector store schema, you may need to set explicit values that map each embedding type to a vector schema name. If you have already customized your vector store schema it may not be necessary.\n",
"\n",
"Old default index names:\n",
"- default-text_unit-text\n",
"- default-entity-description\n",
"- default-community-full_content\n",
"\n",
"(if you left all of the defaults, check your output/lancedb folder to confirm the above)\n",
"\n",
"v3 versions are:\n",
"- text_unit_text\n",
"- entity_description\n",
"- community_full_content\n",
"\n",
"Therefore, with a v2 index we will explicitly set the old index names so it connects correctly.\n",
"\n",
"NOTE: we are also setting the default vector_size for each index, under the assumption that you are using a prior default with 1536 dimensions. Our new default of text-embedding-3-large has 3072 dimensions, which will be populated as the default if unset. Again, if you have a more complicated situation you may want to manually configure this.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"\n",
"EMBEDDING_DIMENSIONS = 1536\n",
"\n",
"settings = Path(PROJECT_DIRECTORY) / \"settings.yaml\"\n",
"with Path.open(settings) as f:\n",
" conf = yaml.safe_load(f)\n",
"\n",
"vector_store = conf.get(\"vector_store\", {})\n",
"container_name = vector_store.get(\"container_name\", \"default\")\n",
"embeddings_schema = vector_store.get(\"embeddings_schema\", {})\n",
"text_unit_schema = embeddings_schema.get(\"text_unit.text\", {})\n",
"if \"index_name\" not in text_unit_schema:\n",
" text_unit_schema[\"index_name\"] = f\"{container_name}-text_unit-text\"\n",
"if \"vector_size\" not in text_unit_schema:\n",
" text_unit_schema[\"vector_size\"] = EMBEDDING_DIMENSIONS\n",
"embeddings_schema[\"text_unit.text\"] = text_unit_schema\n",
"entity_schema = embeddings_schema.get(\"entity.description\", {})\n",
"if \"index_name\" not in entity_schema:\n",
" entity_schema[\"index_name\"] = f\"{container_name}-entity-description\"\n",
"if \"vector_size\" not in entity_schema:\n",
" entity_schema[\"vector_size\"] = EMBEDDING_DIMENSIONS\n",
"embeddings_schema[\"entity.description\"] = entity_schema\n",
"community_schema = embeddings_schema.get(\"community.full_content\", {})\n",
"if \"index_name\" not in community_schema:\n",
" community_schema[\"index_name\"] = f\"{container_name}-community-full_content\"\n",
"if \"vector_size\" not in community_schema:\n",
" community_schema[\"vector_size\"] = EMBEDDING_DIMENSIONS\n",
"embeddings_schema[\"community.full_content\"] = community_schema\n",
"vector_store[\"embeddings_schema\"] = embeddings_schema\n",
"conf[\"vector_store\"] = vector_store\n",
"\n",
"with Path.open(settings, \"w\") as f:\n",
" yaml.safe_dump(conf, f)"
"Therefore, with a v2 index need to explicitly set the old index names so it connects correctly. We no longer support the \"prefix\" - you can just set an explicit index_name for each embedding.\n",
"\n",
"NOTE: we are also setting the default vector_size for each index below, under the assumption that you are using a prior default with 1536 dimensions. Our new default of text-embedding-3-large has 3072 dimensions, which will be populated as the default if unset. Again, if you have a more complicated situation you may want to manually configure this.\n",
"\n",
"Here is an example of the new vector store config block that you may need in your settings.yaml:\n",
"\n",
"```yaml\n",
"vector_store:\n",
" type: lancedb\n",
" db_uri: output/lancedb\n",
" index_schema:\n",
" text_unit_text:\n",
" index_name: default-text_unit-text\n",
" vector_size: 1536\n",
" entity_description:\n",
" index_name: default-entity-description\n",
" vector_size: 1536\n",
" community_full_content:\n",
" index_name: default-community-full_content\n",
" vector_size: 1536\n",
"```\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "graphrag",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -167,7 +138,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
"version": "3.12.3"
}
},
"nbformat": 4,
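Before editing settings.yaml by hand, it can help to confirm which index names your existing vector store actually contains. A minimal sketch, assuming the default local lancedb store at `output/lancedb` (the `lancedb` package and the path are assumptions, not part of this PR):

```python
import lancedb

# Connect to the existing local vector store and list its index tables.
# A default v2-era index should show the old hyphenated names such as
# "default-entity-description"; use whatever appears here as index_name.
db = lancedb.connect("output/lancedb")
print(db.table_names())
```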
3 changes: 3 additions & 0 deletions packages/graphrag-llm/graphrag_llm/config/model_config.py
@@ -3,6 +3,7 @@

"""Language model configuration."""

+ import logging
from typing import Any

from pydantic import BaseModel, ConfigDict, Field, model_validator
@@ -12,6 +13,8 @@
from graphrag_llm.config.retry_config import RetryConfig
from graphrag_llm.config.types import AuthMethod, LLMProviderType

+ logger = logging.getLogger(__name__)


class ModelConfig(BaseModel):
"""Configuration for a language model."""
10 changes: 5 additions & 5 deletions packages/graphrag/graphrag/cli/query.py
@@ -38,7 +38,7 @@ def run_global_search(
"""
cli_overrides: dict[str, Any] = {}
if data_dir:
cli_overrides["output"] = {"base_dir": str(data_dir)}
cli_overrides["output_storage"] = {"base_dir": str(data_dir)}
config = load_config(
root_dir=root_dir,
cli_overrides=cli_overrides,
@@ -124,7 +124,7 @@ def run_local_search(
"""
cli_overrides: dict[str, Any] = {}
if data_dir:
cli_overrides["output"] = {"base_dir": str(data_dir)}
cli_overrides["output_storage"] = {"base_dir": str(data_dir)}
config = load_config(
root_dir=root_dir,
cli_overrides=cli_overrides,
@@ -221,7 +221,7 @@ def run_drift_search(
"""
cli_overrides: dict[str, Any] = {}
if data_dir:
cli_overrides["output"] = {"base_dir": str(data_dir)}
cli_overrides["output_storage"] = {"base_dir": str(data_dir)}
config = load_config(
root_dir=root_dir,
cli_overrides=cli_overrides,
@@ -312,7 +312,7 @@ def run_basic_search(
"""
cli_overrides: dict[str, Any] = {}
if data_dir:
cli_overrides["output"] = {"base_dir": str(data_dir)}
cli_overrides["output_storage"] = {"base_dir": str(data_dir)}
config = load_config(
root_dir=root_dir,
cli_overrides=cli_overrides,
@@ -377,7 +377,7 @@ def _resolve_output_files(
) -> dict[str, Any]:
"""Read indexing output files to a dataframe dict."""
dataframe_dict = {}
- storage_obj = create_storage(config.output)
+ storage_obj = create_storage(config.output_storage)
for name in output_list:
df_value = asyncio.run(load_table_from_storage(name=name, storage=storage_obj))
dataframe_dict[name] = df_value
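Each of these query entry points follows the same pattern: a `--data-dir` flag is now mapped onto the `output_storage` config key rather than `output`. A minimal sketch of that pattern outside the CLI (the import path for `load_config` is an assumption; the diff shows only its call signature, which also matches the unit test further down):

```python
from pathlib import Path
from typing import Any

# Assumed import path for the CLI's config loader; not shown in this diff.
from graphrag.config.load_config import load_config


def resolve_config(root_dir: Path, data_dir: Path | None):
    """Mirror the CLI pattern above: point output storage at data_dir."""
    cli_overrides: dict[str, Any] = {}
    if data_dir:
        # v2 used the "output" key here; v3 renames it to "output_storage".
        cli_overrides["output_storage"] = {"base_dir": str(data_dir)}
    return load_config(root_dir=root_dir, cli_overrides=cli_overrides)
```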
10 changes: 6 additions & 4 deletions packages/graphrag/graphrag/config/models/graph_rag_config.py
@@ -102,7 +102,7 @@ def _validate_input_base_dir(self) -> None:
)
"""The chunking configuration to use."""

- output: StorageConfig = Field(
+ output_storage: StorageConfig = Field(
description="The output configuration.",
default=StorageConfig(
base_dir=graphrag_config_defaults.output_storage.base_dir,
@@ -112,11 +112,13 @@ def _validate_input_base_dir(self) -> None:

def _validate_output_base_dir(self) -> None:
"""Validate the output base directory."""
- if self.output.type == StorageType.File:
- if not self.output.base_dir:
+ if self.output_storage.type == StorageType.File:
+ if not self.output_storage.base_dir:
msg = "output base directory is required for file output. Please rerun `graphrag init` and set the output configuration."
raise ValueError(msg)
- self.output.base_dir = str(Path(self.output.base_dir).resolve())
+ self.output_storage.base_dir = str(
+ Path(self.output_storage.base_dir).resolve()
+ )

update_output_storage: StorageConfig = Field(
description="The output configuration for the updated index.",
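The field rename on `GraphRagConfig` cascades to any downstream code that reads the output block. A minimal before/after sketch for a hypothetical consumer (`open_output` is illustrative; `create_storage` and the field names come from this PR):

```python
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag_storage.storage_factory import create_storage


def open_output(config: GraphRagConfig):
    # v2: storage = create_storage(config.output)
    # v3: the storage block now lives on `config.output_storage`
    return create_storage(config.output_storage)
```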
2 changes: 1 addition & 1 deletion packages/graphrag/graphrag/index/run/run_pipeline.py
@@ -36,7 +36,7 @@ async def run_pipeline(
) -> AsyncIterable[PipelineRunResult]:
"""Run all workflows using a simplified pipeline."""
input_storage = create_storage(config.input_storage)
- output_storage = create_storage(config.output)
+ output_storage = create_storage(config.output_storage)
cache = create_cache(config.cache)

# load existing state in case any workflows are stateful
2 changes: 1 addition & 1 deletion packages/graphrag/graphrag/index/run/utils.py
@@ -52,7 +52,7 @@ def get_update_storages(
config: GraphRagConfig, timestamp: str
) -> tuple[Storage, Storage, Storage]:
"""Get storage objects for the update index run."""
- output_storage = create_storage(config.output)
+ output_storage = create_storage(config.output_storage)
update_storage = create_storage(config.update_output_storage)
timestamped_storage = update_storage.child(timestamp)
delta_storage = timestamped_storage.child("delta")
4 changes: 2 additions & 2 deletions tests/unit/config/test_config.py
@@ -49,11 +49,11 @@ def test_load_config_with_cli_overrides() -> None:
output_dir = "some_output_dir"
expected_output_base_dir = root_dir / output_dir
expected = get_default_graphrag_config()
- expected.output.base_dir = str(expected_output_base_dir)
+ expected.output_storage.base_dir = str(expected_output_base_dir)

actual = load_config(
root_dir=root_dir,
cli_overrides={"output": {"base_dir": output_dir}},
cli_overrides={"output_storage": {"base_dir": output_dir}},
)
assert_graphrag_configs(actual, expected)
# Need to reset cwd after test
2 changes: 1 addition & 1 deletion tests/unit/config/utils.py
@@ -351,7 +351,7 @@ def assert_graphrag_configs(actual: GraphRagConfig, expected: GraphRagConfig) ->

assert_vector_store_configs(actual.vector_store, expected.vector_store)
assert_reporting_configs(actual.reporting, expected.reporting)
- assert_storage_config(actual.output, expected.output)
+ assert_storage_config(actual.output_storage, expected.output_storage)
assert_storage_config(actual.input_storage, expected.input_storage)
assert_storage_config(actual.update_output_storage, expected.update_output_storage)
