diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e164f396..4617123d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1,2 @@ * @josh-chamberlain +* @maxachis diff --git a/alembic/versions/2026_02_27_1215-3a9f8e1d5b2c_migrate_batch_strategy_to_lookup_table.py b/alembic/versions/2026_02_27_1215-3a9f8e1d5b2c_migrate_batch_strategy_to_lookup_table.py new file mode 100644 index 00000000..4c9e99ed --- /dev/null +++ b/alembic/versions/2026_02_27_1215-3a9f8e1d5b2c_migrate_batch_strategy_to_lookup_table.py @@ -0,0 +1,96 @@ +"""Migrate batch strategy enum to lookup table + +Revision ID: 3a9f8e1d5b2c +Revises: 1fb2286a016c +Create Date: 2026-02-27 12:15:00.000000 + +""" +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = '3a9f8e1d5b2c' +down_revision: Union[str, None] = '1fb2286a016c' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +STRATEGY_VALUES = [ + "example", + "ckan", + "muckrock_county_search", + "auto_googler", + "muckrock_all_search", + "muckrock_simple_search", + "common_crawler", + "manual", + "internet_archive", +] + + +def upgrade() -> None: + op.create_table( + "batch_strategies", + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column("name", sa.String(), nullable=False), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("name"), + ) + + values_sql = ", ".join(f"('{value}')" for value in STRATEGY_VALUES) + op.execute(f"INSERT INTO batch_strategies (name) VALUES {values_sql}") + + op.add_column( + "batches", + sa.Column("batch_strategy_id", sa.Integer(), nullable=True), + ) + op.execute(""" + UPDATE batches AS b + SET batch_strategy_id = bs.id + FROM batch_strategies AS bs + WHERE b.strategy::text = bs.name + """) + op.alter_column("batches", "batch_strategy_id", nullable=False) + op.create_foreign_key( + "fk_batches_batch_strategy_id", + "batches", + "batch_strategies", + ["batch_strategy_id"], + ["id"], + ) + op.create_index( + "ix_batches_batch_strategy_id", + "batches", + ["batch_strategy_id"], + unique=False, + ) + + op.drop_column("batches", "strategy") + op.execute("DROP TYPE IF EXISTS batch_strategy") + + +def downgrade() -> None: + batch_strategy_enum = postgresql.ENUM( + *STRATEGY_VALUES, + name="batch_strategy", + ) + batch_strategy_enum.create(op.get_bind(), checkfirst=True) + + op.add_column( + "batches", + sa.Column("strategy", batch_strategy_enum, nullable=True), + ) + op.execute(""" + UPDATE batches AS b + SET strategy = bs.name::batch_strategy + FROM batch_strategies AS bs + WHERE b.batch_strategy_id = bs.id + """) + op.alter_column("batches", "strategy", nullable=False) + + op.drop_index("ix_batches_batch_strategy_id", table_name="batches") + op.drop_constraint("fk_batches_batch_strategy_id", "batches", type_="foreignkey") + op.drop_column("batches", "batch_strategy_id") + op.drop_table("batch_strategies") diff --git a/pyproject.toml b/pyproject.toml index 415094ff..de3b9206 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ dev = [ "pytest-asyncio~=0.25.2", "pytest-mock==3.12.0", "pytest-timeout~=2.3.1", + "ruff>=0.15.4", "vulture>=2.14", ] diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 8216b10b..6eb5186c 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -1,3 +1,4 @@ +from sqlalchemy import select from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import AsyncSession @@ -6,6 +7,7 @@ from src.collectors.enums import CollectorType from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL @@ -29,8 +31,14 @@ def __init__( async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: + strategy_id: int | None = await session.scalar( + select(BatchStrategy.id).where(BatchStrategy.name == CollectorType.MANUAL.value) + ) + if strategy_id is None: + raise ValueError(f"Unknown batch strategy: {CollectorType.MANUAL.value}") + batch = Batch( - strategy=CollectorType.MANUAL.value, + batch_strategy_id=strategy_id, status=BatchStatus.READY_TO_LABEL.value, parameters={ "name": self.dto.name @@ -95,4 +103,4 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: batch_id=batch_id, urls=url_ids, duplicate_urls=duplicate_urls - ) \ No newline at end of file + ) diff --git a/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py b/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py index 7eed215a..7df43c60 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py @@ -6,6 +6,7 @@ from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse from src.db.helpers.session import session_helper as sh from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.queries.base.builder import QueryBuilderBase @@ -16,13 +17,18 @@ async def run(self, session: AsyncSession) -> list[CountByBatchStrategyResponse] query = ( select( - Batch.strategy, + BatchStrategy.name.label("strategy"), func.count(LinkBatchURL.url_id).label("count") ) + .select_from(Batch) + .join( + BatchStrategy, + Batch.batch_strategy_id == BatchStrategy.id, + ) .join(LinkBatchURL) - .group_by(Batch.strategy) + .group_by(BatchStrategy.name) ) mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] - return results \ No newline at end of file + return results diff --git a/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py index f8587b68..5e984452 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py @@ -1,6 +1,6 @@ from typing import Sequence -from sqlalchemy import CTE, select, func, RowMapping +from sqlalchemy import select, func, RowMapping from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.response import \ @@ -8,6 +8,7 @@ from src.collectors.enums import CollectorType from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh @@ -17,11 +18,16 @@ class BatchStatusByBatchStrategyQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> list[BatchStatusCountByBatchStrategyResponseDTO]: query = ( select( - Batch.strategy, + BatchStrategy.name.label("strategy"), Batch.status, func.count(Batch.id).label("count") ) - .group_by(Batch.strategy, Batch.status) + .select_from(Batch) + .join( + BatchStrategy, + Batch.batch_strategy_id == BatchStrategy.id, + ) + .group_by(BatchStrategy.name, Batch.status) ) mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) @@ -34,4 +40,4 @@ async def run(self, session: AsyncSession) -> list[BatchStatusCountByBatchStrate count=mapping["count"] ) ) - return results \ No newline at end of file + return results diff --git a/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py b/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py index 224d3bad..eb7b0ff6 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py @@ -5,6 +5,7 @@ from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.queries.base.builder import QueryBuilderBase @@ -17,9 +18,14 @@ async def run( query = ( select( - Batch.strategy, + BatchStrategy.name.label("strategy"), func.count(LinkBatchURL.url_id).label("count") ) + .select_from(Batch) + .join( + BatchStrategy, + Batch.batch_strategy_id == BatchStrategy.id, + ) .join( LinkBatchURL, LinkBatchURL.batch_id == Batch.id @@ -29,7 +35,7 @@ async def run( FlagURLValidated.url_id == LinkBatchURL.url_id ) .where(FlagURLValidated.url_id.is_(None)) - .group_by(Batch.strategy) + .group_by(BatchStrategy.name) ) mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) diff --git a/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py b/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py index 7b94f2ba..3a60ca32 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py @@ -5,6 +5,7 @@ from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL @@ -19,9 +20,14 @@ async def run( query = ( select( - Batch.strategy, + BatchStrategy.name.label("strategy"), func.count(FlagURLValidated.url_id).label("count") ) + .select_from(Batch) + .join( + BatchStrategy, + Batch.batch_strategy_id == BatchStrategy.id, + ) .join( LinkBatchURL, LinkBatchURL.batch_id == Batch.id @@ -31,7 +37,7 @@ async def run( FlagURLValidated.url_id == LinkBatchURL.url_id ) .where(FlagURLValidated.type == URLType.NOT_RELEVANT) - .group_by(Batch.strategy) + .group_by(BatchStrategy.name) ) mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) diff --git a/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py index e3fa9d14..02f84b4d 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py @@ -7,6 +7,7 @@ from src.collectors.enums import CollectorType from src.db.helpers.session import session_helper as sh from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.queries.base.builder import QueryBuilderBase @@ -19,9 +20,14 @@ async def run(self, session: AsyncSession) -> list[ ]: query = ( select( - Batch.strategy, + BatchStrategy.name.label("strategy"), func.count(DSAppLinkDataSource.id).label("count") ) + .select_from(Batch) + .join( + BatchStrategy, + Batch.batch_strategy_id == BatchStrategy.id, + ) .join( LinkBatchURL, LinkBatchURL.batch_id == Batch.id @@ -30,7 +36,7 @@ async def run(self, session: AsyncSession) -> list[ DSAppLinkDataSource, DSAppLinkDataSource.url_id == LinkBatchURL.url_id ) - .group_by(Batch.strategy) + .group_by(BatchStrategy.name) ) mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) diff --git a/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py index 6712c76d..c76776c6 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py @@ -7,6 +7,7 @@ from src.db.helpers.query import exists_url from src.db.helpers.session import session_helper as sh from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError @@ -18,20 +19,22 @@ class URLErrorByBatchStrategyQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> list[CountByBatchStrategyResponse]: query = ( select( - Batch.strategy, + BatchStrategy.name.label("strategy"), func.count(URL.id).label("count") ) .select_from(Batch) + .join( + BatchStrategy, + Batch.batch_strategy_id == BatchStrategy.id, + ) .join(LinkBatchURL) .join(URL) .where( exists_url(URLTaskError) ) - .group_by(Batch.strategy) + .group_by(BatchStrategy.name) ) mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] return results - - diff --git a/src/api/endpoints/metrics/batches/aggregated/query/validated_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/validated_/query.py index 155cbcb0..c86a4cde 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/validated_/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/validated_/query.py @@ -6,6 +6,7 @@ from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse from src.db.helpers.session import session_helper as sh from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.queries.base.builder import QueryBuilderBase @@ -19,9 +20,14 @@ async def run( query = ( select( - Batch.strategy, + BatchStrategy.name.label("strategy"), func.count(FlagURLValidated.url_id).label("count") ) + .select_from(Batch) + .join( + BatchStrategy, + Batch.batch_strategy_id == BatchStrategy.id, + ) .join( LinkBatchURL, LinkBatchURL.batch_id == Batch.id @@ -30,7 +36,7 @@ async def run( FlagURLValidated, FlagURLValidated.url_id == LinkBatchURL.url_id ) - .group_by(Batch.strategy) + .group_by(BatchStrategy.name) ) mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) diff --git a/src/api/endpoints/metrics/batches/breakdown/query.py b/src/api/endpoints/metrics/batches/breakdown/query.py index d46a01b9..c12da345 100644 --- a/src/api/endpoints/metrics/batches/breakdown/query.py +++ b/src/api/endpoints/metrics/batches/breakdown/query.py @@ -14,8 +14,8 @@ from src.collectors.enums import CollectorType from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import StatementComposer class GetBatchesBreakdownMetricsQueryBuilder(QueryBuilderBase): @@ -28,13 +28,14 @@ def __init__( self.page = page async def run(self, session: AsyncSession) -> GetMetricsBatchesBreakdownResponseDTO: - sc = StatementComposer - main_query = select( - Batch.strategy, + BatchStrategy.name.label("strategy"), Batch.id, Batch.status, Batch.date_generated.label("created_at"), + ).join( + BatchStrategy, + Batch.batch_strategy_id == BatchStrategy.id, ) all_ctes: list[BatchesBreakdownURLCTE] = [ @@ -103,4 +104,4 @@ async def run(self, session: AsyncSession) -> GetMetricsBatchesBreakdownResponse batches.append(dto) return GetMetricsBatchesBreakdownResponseDTO( batches=batches, - ) \ No newline at end of file + ) diff --git a/src/api/endpoints/submit/data_source/queries/core.py b/src/api/endpoints/submit/data_source/queries/core.py index eed12aca..c0a5df43 100644 --- a/src/api/endpoints/submit/data_source/queries/core.py +++ b/src/api/endpoints/submit/data_source/queries/core.py @@ -1,5 +1,6 @@ import uuid +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse @@ -12,6 +13,7 @@ from src.db.models.impl.annotation.record_type.anon.sqlalchemy import AnnotationRecordTypeAnon from src.db.models.impl.annotation.url_type.anon.sqlalchemy import AnnotationURLTypeAnon from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.enums import URLSource @@ -51,8 +53,14 @@ async def run( url_id: int = url.id # Add Batch + strategy_id: int | None = await session.scalar( + select(BatchStrategy.id).where(BatchStrategy.name == "manual") + ) + if strategy_id is None: + raise ValueError("Unknown batch strategy: manual") + batch = Batch( - strategy='manual', + batch_strategy_id=strategy_id, status=BatchStatus.READY_TO_LABEL, parameters={} ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py index 052a5fb3..88a87b35 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py @@ -6,6 +6,7 @@ get_exists_subtask_query from src.db.models.impl.annotation.agency.auto.subtask.enum import AutoAgencyIDSubtaskType from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL @@ -24,8 +25,12 @@ Batch, Batch.id == LinkBatchURL.batch_id, ) + .join( + BatchStrategy, + Batch.batch_strategy_id == BatchStrategy.id, + ) .where( - Batch.strategy == CollectorType.CKAN.value, + BatchStrategy.name == CollectorType.CKAN.value, ) .cte("ckan_eligible") @@ -33,4 +38,4 @@ CKAN_SUBTASK_CONTAINER = URLsSubtaskEligibleCTEContainer( cte, -) \ No newline at end of file +) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py index 5a83e029..1f321d58 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py @@ -6,6 +6,7 @@ get_exists_subtask_query from src.db.models.impl.annotation.agency.auto.subtask.enum import AutoAgencyIDSubtaskType from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL @@ -24,8 +25,12 @@ Batch, Batch.id == LinkBatchURL.batch_id, ) + .join( + BatchStrategy, + Batch.batch_strategy_id == BatchStrategy.id, + ) .where( - Batch.strategy.in_( + BatchStrategy.name.in_( (CollectorType.MUCKROCK_ALL_SEARCH.value, CollectorType.MUCKROCK_COUNTY_SEARCH.value, CollectorType.MUCKROCK_SIMPLE_SEARCH.value,) @@ -36,4 +41,4 @@ MUCKROCK_SUBTASK_CONTAINER = URLsSubtaskEligibleCTEContainer( cte, -) \ No newline at end of file +) diff --git a/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py b/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py index 0efbfceb..2f1aabc4 100644 --- a/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py +++ b/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py @@ -3,6 +3,7 @@ from src.collectors.enums import CollectorType from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo +from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.url.html.content.enums import HTMLContentType from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase @@ -16,7 +17,7 @@ async def run(self, session: AsyncSession) -> list[URLMiscellaneousMetadataTDO]: query = StatementComposer.pending_urls_missing_miscellaneous_metadata_query() query = ( query.options( - selectinload(URL.batch), + selectinload(URL.batch).selectinload(Batch.batch_strategy), selectinload(URL.html_content) ).limit(100).order_by(URL.id) ) @@ -28,7 +29,7 @@ async def run(self, session: AsyncSession) -> list[URLMiscellaneousMetadataTDO]: tdo = URLMiscellaneousMetadataTDO( url_id=result.id, collector_metadata=result.collector_metadata or {}, - collector_type=CollectorType(result.batch.strategy), + collector_type=CollectorType(result.batch.batch_strategy.name), ) html_info = URLHTMLMetadataInfo() for html_content in result.html_content: diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 6377fa60..830bcdca 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -60,6 +60,7 @@ from src.db.models.impl.backlog_snapshot import BacklogSnapshot from src.db.models.impl.batch.pydantic.info import BatchInfo from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.impl.duplicate.pydantic.info import DuplicateInfo from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated @@ -632,8 +633,14 @@ async def insert_batch( batch_info: BatchInfo ) -> int: """Insert a new batch into the database and return its ID.""" + batch_strategy_id: int | None = await session.scalar( + select(BatchStrategy.id).where(BatchStrategy.name == batch_info.strategy) + ) + if batch_strategy_id is None: + raise ValueError(f"Unknown batch strategy: {batch_info.strategy}") + batch = Batch( - strategy=batch_info.strategy, + batch_strategy_id=batch_strategy_id, user_id=batch_info.user_id, status=batch_info.status.value, parameters=batch_info.parameters, @@ -907,4 +914,4 @@ async def refresh_materialized_views(self): ) await self.execute( text("REFRESH MATERIALIZED VIEW mat_view__html_duplicate_url") - ) \ No newline at end of file + ) diff --git a/src/db/client/sync.py b/src/db/client/sync.py index c5d90167..5d17dfe5 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -12,6 +12,7 @@ from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.batch.pydantic.info import BatchInfo from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.models.impl.duplicate.sqlalchemy import Duplicate from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL @@ -69,8 +70,12 @@ def add_all(self, session: Session, objects: list[Base]): @session_manager def insert_batch(self, session: Session, batch_info: BatchInfo) -> int: """Insert a new batch into the database and return its ID.""" + strategy = session.query(BatchStrategy).filter_by(name=batch_info.strategy).first() + if strategy is None: + raise ValueError(f"Unknown batch strategy: {batch_info.strategy}") + batch = Batch( - strategy=batch_info.strategy, + batch_strategy_id=strategy.id, user_id=batch_info.user_id, status=batch_info.status.value, parameters=batch_info.parameters, @@ -91,7 +96,19 @@ def get_batch_by_id( ) -> BatchInfo | None: """Retrieve a batch by ID.""" batch = session.query(Batch).filter_by(id=batch_id).first() - return BatchInfo(**batch.__dict__) + if batch is None: + return None + + return BatchInfo( + id=batch.id, + strategy=batch.batch_strategy.name, + status=BatchStatus(batch.status), + parameters=batch.parameters, + user_id=batch.user_id, + total_url_count=getattr(batch, "total_url_count", None), + compute_time=batch.compute_time, + date_generated=batch.date_generated, + ) @session_manager @@ -149,7 +166,7 @@ def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo try: url_id = self.insert_url(url_info) url_mappings.append(SimpleURLMapping(url_id=url_id, url=url_info.url)) - except IntegrityError as e: + except IntegrityError: orig_url_info = self.get_url_info_by_url(url_info.url) duplicate_info = DuplicateInsertInfo( batch_id=batch_id, diff --git a/src/db/models/impl/batch/sqlalchemy.py b/src/db/models/impl/batch/sqlalchemy.py index bfb9b894..7c78f38f 100644 --- a/src/db/models/impl/batch/sqlalchemy.py +++ b/src/db/models/impl/batch/sqlalchemy.py @@ -1,5 +1,4 @@ -from sqlalchemy import Column, Integer, TIMESTAMP, Float, JSON -from sqlalchemy.dialects import postgresql +from sqlalchemy import Column, Integer, TIMESTAMP, Float, JSON, ForeignKey from sqlalchemy.orm import relationship, Mapped from src.core.enums import BatchStatus @@ -12,19 +11,12 @@ class Batch(WithIDBase): __tablename__ = 'batches' - strategy = Column( - postgresql.ENUM( - 'example', - 'ckan', - 'muckrock_county_search', - 'auto_googler', - 'muckrock_all_search', - 'muckrock_simple_search', - 'common_crawler', - 'manual', - 'internet_archive', - name='batch_strategy'), - nullable=False) + batch_strategy_id = Column( + Integer, + ForeignKey("batch_strategies.id"), + nullable=False, + index=True, + ) user_id = Column(Integer, nullable=True) # Gives the status of the batch status: Mapped[BatchStatus] = Column( @@ -40,6 +32,11 @@ class Batch(WithIDBase): parameters = Column(JSON) # Relationships + batch_strategy = relationship( + "BatchStrategy", + back_populates="batches", + lazy="joined", + ) urls = relationship( "URL", secondary="link_batches__urls", @@ -50,3 +47,9 @@ class Batch(WithIDBase): # missings = relationship("Missing", back_populates="batch") logs = relationship(Log, back_populates="batch") duplicates = relationship("Duplicate", back_populates="batch") + + @property + def strategy(self) -> str | None: + if self.batch_strategy is None: + return None + return self.batch_strategy.name diff --git a/src/db/models/impl/batch/strategy/__init__.py b/src/db/models/impl/batch/strategy/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/db/models/impl/batch/strategy/__init__.py @@ -0,0 +1 @@ + diff --git a/src/db/models/impl/batch/strategy/sqlalchemy.py b/src/db/models/impl/batch/strategy/sqlalchemy.py new file mode 100644 index 00000000..7a8dad24 --- /dev/null +++ b/src/db/models/impl/batch/strategy/sqlalchemy.py @@ -0,0 +1,15 @@ +from sqlalchemy import Column, String +from sqlalchemy.orm import relationship + +from src.db.models.templates_.with_id import WithIDBase + + +class BatchStrategy(WithIDBase): + __tablename__ = "batch_strategies" + + name = Column(String, nullable=False, unique=True) + + batches = relationship( + "Batch", + back_populates="batch_strategy", + ) diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py index f5696e7e..33237fe0 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py @@ -6,6 +6,7 @@ from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary from src.collectors.enums import CollectorType from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.materialized_views.batch_url_status.core import BatchURLStatusMaterializedView from src.db.models.materialized_views.batch_url_status.enums import BatchURLStatusViewEnum from src.db.queries.base.builder import QueryBuilderBase @@ -38,7 +39,7 @@ async def run(self, session: AsyncSession) -> list[BatchSummary]: query = ( Select( *builder.get_all(), - Batch.strategy, + BatchStrategy.name.label("strategy"), Batch.status, BatchURLStatusMaterializedView.batch_url_status, Batch.parameters, @@ -48,6 +49,9 @@ async def run(self, session: AsyncSession) -> list[BatchSummary]: ).join( builder.query, builder.get(count_labels.batch_id) == Batch.id, + ).join( + BatchStrategy, + Batch.batch_strategy_id == BatchStrategy.id, ).outerjoin( BatchURLStatusMaterializedView, BatchURLStatusMaterializedView.batch_id == Batch.id, diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py index 7192f1fa..20e8618b 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py @@ -3,6 +3,7 @@ from src.collectors.enums import CollectorType from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.batch.strategy.sqlalchemy import BatchStrategy from src.db.models.materialized_views.batch_url_status.core import BatchURLStatusMaterializedView from src.db.models.materialized_views.batch_url_status.enums import BatchURLStatusViewEnum from src.db.queries.base.builder import QueryBuilderBase @@ -43,6 +44,10 @@ def get_core_query(self): func.coalesce(ERROR_CTE.count, 0).label(labels.error), ) .select_from(Batch) + .join( + BatchStrategy, + Batch.batch_strategy_id == BatchStrategy.id, + ) .join( BatchURLStatusMaterializedView, BatchURLStatusMaterializedView.batch_id == Batch.id, @@ -73,7 +78,7 @@ def apply_batch_id_filter(self, query: Select): def apply_collector_type_filter(self, query: Select): if self.collector_type is None: return query - return query.where(Batch.strategy == self.collector_type.value) + return query.where(BatchStrategy.name == self.collector_type.value) def apply_status_filter(self, query: Select): if self.status is None: diff --git a/tests/automated/integration/db/structure/test_batch.py b/tests/automated/integration/db/structure/test_batch.py index f905b178..54856e57 100644 --- a/tests/automated/integration/db/structure/test_batch.py +++ b/tests/automated/integration/db/structure/test_batch.py @@ -1,8 +1,7 @@ import sqlalchemy as sa -from sqlalchemy import create_engine +from sqlalchemy import create_engine, text from sqlalchemy.dialects import postgresql -from src.collectors.enums import CollectorType from src.core.enums import BatchStatus from src.db.helpers.connect import get_postgres_connection_string from src.util.helper_functions import get_enum_values @@ -12,13 +11,22 @@ def test_batch(wiped_database): engine = create_engine(get_postgres_connection_string()) + + # Get valid batch_strategy_id from lookup table + with engine.connect() as conn: + strategy_ids = [ + row[0] for row in conn.execute( + text("SELECT id FROM batch_strategies ORDER BY id") + ).fetchall() + ] + table_tester = TableTester( table_name="batches", columns=[ ColumnTester( - column_name="strategy", - type_=postgresql.ENUM, - allowed_values=get_enum_values(CollectorType), + column_name="batch_strategy_id", + type_=sa.Integer, + allowed_values=strategy_ids, ), ColumnTester( column_name="user_id", diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py index 1c2073fd..08a18f3e 100644 --- a/tests/helpers/data_creator/create.py +++ b/tests/helpers/data_creator/create.py @@ -5,7 +5,7 @@ from src.db import County, Locality, USState from src.db.client.async_ import AsyncDatabaseClient from src.db.dtos.url.mapping_.simple import SimpleURLMapping -from src.db.models.impl.batch.pydantic.insert import BatchInsertModel +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic from src.db.models.impl.link.batch_url.pydantic import LinkBatchURLPydantic @@ -14,7 +14,7 @@ from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic from src.db.models.impl.url.record_type.pydantic import URLRecordTypePydantic from tests.helpers.counter import next_int -from tests.helpers.data_creator.generate import generate_batch, generate_urls, generate_validated_flags, \ +from tests.helpers.data_creator.generate import generate_urls, generate_validated_flags, \ generate_url_data_sources, generate_batch_url_links from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo @@ -27,8 +27,15 @@ async def create_batch( strategy: CollectorType = CollectorType.EXAMPLE, date_generated: datetime = datetime.now(), ) -> int: - batch: BatchInsertModel = generate_batch(status=status, strategy=strategy, date_generated=date_generated) - return (await adb_client.bulk_insert([batch], return_ids=True))[0] + return await adb_client.insert_batch( + BatchInfo( + strategy=strategy.value, + status=status, + parameters={}, + user_id=1, + date_generated=date_generated, + ) + ) async def create_urls( adb_client: AsyncDatabaseClient, @@ -153,4 +160,4 @@ async def create_locality( return LocalityCreationInfo( locality_id=locality_id, location_id=location_id, - ) \ No newline at end of file + ) diff --git a/tests/helpers/setup/wipe.py b/tests/helpers/setup/wipe.py index 7d4f0672..602bc916 100644 --- a/tests/helpers/setup/wipe.py +++ b/tests/helpers/setup/wipe.py @@ -2,6 +2,10 @@ from src.db.models.templates_.base import Base +PRESERVED_TABLES = { + "batch_strategies", +} + def wipe_database(engine: Engine) -> None: """Wipe all data from database.""" @@ -9,5 +13,7 @@ def wipe_database(engine: Engine) -> None: for table in reversed(Base.metadata.sorted_tables): if table.info == "view": continue + if table.name in PRESERVED_TABLES: + continue connection.execute(table.delete()) connection.commit() diff --git a/uv.lock b/uv.lock index 0a04d3a2..28183bc7 100644 --- a/uv.lock +++ b/uv.lock @@ -541,6 +541,7 @@ dev = [ { name = "pytest-asyncio" }, { name = "pytest-mock" }, { name = "pytest-timeout" }, + { name = "ruff" }, { name = "vulture" }, ] @@ -600,6 +601,7 @@ dev = [ { name = "pytest-asyncio", specifier = "~=0.25.2" }, { name = "pytest-mock", specifier = "==3.12.0" }, { name = "pytest-timeout", specifier = "~=2.3.1" }, + { name = "ruff", specifier = ">=0.15.4" }, { name = "vulture", specifier = ">=2.14" }, ] @@ -2536,6 +2538,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, ] +[[package]] +name = "ruff" +version = "0.15.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/77/9b/840e0039e65fcf12758adf684d2289024d6140cde9268cc59887dc55189c/ruff-0.15.5.tar.gz", hash = "sha256:7c3601d3b6d76dce18c5c824fc8d06f4eef33d6df0c21ec7799510cde0f159a2", size = 4574214, upload-time = "2026-03-05T20:06:34.946Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/20/5369c3ce21588c708bcbe517a8fbe1a8dfdb5dfd5137e14790b1da71612c/ruff-0.15.5-py3-none-linux_armv6l.whl", hash = "sha256:4ae44c42281f42e3b06b988e442d344a5b9b72450ff3c892e30d11b29a96a57c", size = 10478185, upload-time = "2026-03-05T20:06:29.093Z" }, + { url = "https://files.pythonhosted.org/packages/44/ed/e81dd668547da281e5dce710cf0bc60193f8d3d43833e8241d006720e42b/ruff-0.15.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6edd3792d408ebcf61adabc01822da687579a1a023f297618ac27a5b51ef0080", size = 10859201, upload-time = "2026-03-05T20:06:32.632Z" }, + { url = "https://files.pythonhosted.org/packages/c4/8f/533075f00aaf19b07c5cd6aa6e5d89424b06b3b3f4583bfa9c640a079059/ruff-0.15.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:89f463f7c8205a9f8dea9d658d59eff49db05f88f89cc3047fb1a02d9f344010", size = 10184752, upload-time = "2026-03-05T20:06:40.312Z" }, + { url = "https://files.pythonhosted.org/packages/66/0e/ba49e2c3fa0395b3152bad634c7432f7edfc509c133b8f4529053ff024fb/ruff-0.15.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba786a8295c6574c1116704cf0b9e6563de3432ac888d8f83685654fe528fd65", size = 10534857, upload-time = "2026-03-05T20:06:19.581Z" }, + { url = "https://files.pythonhosted.org/packages/59/71/39234440f27a226475a0659561adb0d784b4d247dfe7f43ffc12dd02e288/ruff-0.15.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fd4b801e57955fe9f02b31d20375ab3a5c4415f2e5105b79fb94cf2642c91440", size = 10309120, upload-time = "2026-03-05T20:06:00.435Z" }, + { url = "https://files.pythonhosted.org/packages/f5/87/4140aa86a93df032156982b726f4952aaec4a883bb98cb6ef73c347da253/ruff-0.15.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:391f7c73388f3d8c11b794dbbc2959a5b5afe66642c142a6effa90b45f6f5204", size = 11047428, upload-time = "2026-03-05T20:05:51.867Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f7/4953e7e3287676f78fbe85e3a0ca414c5ca81237b7575bdadc00229ac240/ruff-0.15.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8dc18f30302e379fe1e998548b0f5e9f4dff907f52f73ad6da419ea9c19d66c8", size = 11914251, upload-time = "2026-03-05T20:06:22.887Z" }, + { url = "https://files.pythonhosted.org/packages/77/46/0f7c865c10cf896ccf5a939c3e84e1cfaeed608ff5249584799a74d33835/ruff-0.15.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1cc6e7f90087e2d27f98dc34ed1b3ab7c8f0d273cc5431415454e22c0bd2a681", size = 11333801, upload-time = "2026-03-05T20:05:57.168Z" }, + { url = "https://files.pythonhosted.org/packages/d3/01/a10fe54b653061585e655f5286c2662ebddb68831ed3eaebfb0eb08c0a16/ruff-0.15.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1cb7169f53c1ddb06e71a9aebd7e98fc0fea936b39afb36d8e86d36ecc2636a", size = 11206821, upload-time = "2026-03-05T20:06:03.441Z" }, + { url = "https://files.pythonhosted.org/packages/7a/0d/2132ceaf20c5e8699aa83da2706ecb5c5dcdf78b453f77edca7fb70f8a93/ruff-0.15.5-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:9b037924500a31ee17389b5c8c4d88874cc6ea8e42f12e9c61a3d754ff72f1ca", size = 11133326, upload-time = "2026-03-05T20:06:25.655Z" }, + { url = "https://files.pythonhosted.org/packages/72/cb/2e5259a7eb2a0f87c08c0fe5bf5825a1e4b90883a52685524596bfc93072/ruff-0.15.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:65bb414e5b4eadd95a8c1e4804f6772bbe8995889f203a01f77ddf2d790929dd", size = 10510820, upload-time = "2026-03-05T20:06:37.79Z" }, + { url = "https://files.pythonhosted.org/packages/ff/20/b67ce78f9e6c59ffbdb5b4503d0090e749b5f2d31b599b554698a80d861c/ruff-0.15.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d20aa469ae3b57033519c559e9bc9cd9e782842e39be05b50e852c7c981fa01d", size = 10302395, upload-time = "2026-03-05T20:05:54.504Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e5/719f1acccd31b720d477751558ed74e9c88134adcc377e5e886af89d3072/ruff-0.15.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:15388dd28c9161cdb8eda68993533acc870aa4e646a0a277aa166de9ad5a8752", size = 10754069, upload-time = "2026-03-05T20:06:06.422Z" }, + { url = "https://files.pythonhosted.org/packages/c3/9c/d1db14469e32d98f3ca27079dbd30b7b44dbb5317d06ab36718dee3baf03/ruff-0.15.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b30da330cbd03bed0c21420b6b953158f60c74c54c5f4c1dabbdf3a57bf355d2", size = 11304315, upload-time = "2026-03-05T20:06:10.867Z" }, + { url = "https://files.pythonhosted.org/packages/28/3a/950367aee7c69027f4f422059227b290ed780366b6aecee5de5039d50fa8/ruff-0.15.5-py3-none-win32.whl", hash = "sha256:732e5ee1f98ba5b3679029989a06ca39a950cced52143a0ea82a2102cb592b74", size = 10551676, upload-time = "2026-03-05T20:06:13.705Z" }, + { url = "https://files.pythonhosted.org/packages/b8/00/bf077a505b4e649bdd3c47ff8ec967735ce2544c8e4a43aba42ee9bf935d/ruff-0.15.5-py3-none-win_amd64.whl", hash = "sha256:821d41c5fa9e19117616c35eaa3f4b75046ec76c65e7ae20a333e9a8696bc7fe", size = 11678972, upload-time = "2026-03-05T20:06:45.379Z" }, + { url = "https://files.pythonhosted.org/packages/fe/4e/cd76eca6db6115604b7626668e891c9dd03330384082e33662fb0f113614/ruff-0.15.5-py3-none-win_arm64.whl", hash = "sha256:b498d1c60d2fe5c10c45ec3f698901065772730b411f164ae270bb6bfcc4740b", size = 10965572, upload-time = "2026-03-05T20:06:16.984Z" }, +] + [[package]] name = "setuptools" version = "80.4.0"