Ingestion

`cifflow.ingestion.ingest`

DuckDB ingestion for CifFile objects.

`IngestionError`

Bases: Exception

Raised when one or more semantic errors prevent successful ingestion.

Parameters:

Name	Type	Description	Default
`errors`	`list[str]`	Ordered list of error message strings.	required

Attributes:

Name	Type	Description
`errors`	`list[str]`	Ordered list of error message strings.

Source code in src/cifflow/ingestion/ingest.py

class IngestionError(Exception):
    """Raised when one or more semantic errors prevent successful ingestion.

    The exception message is a one-line summary: the number of errors, the
    first message, and a count of the remaining ones.

    Parameters
    ----------
    errors
        Ordered list of error message strings.

    Attributes
    ----------
    errors
        Ordered list of error message strings. This is a copy taken at
        construction time, so later changes to the caller's list do not
        alter the exception.
    """

    errors: list[str]

    def __init__(self, errors: list[str]) -> None:
        # Snapshot the messages so the exception cannot drift out of sync
        # with the summary computed below if the caller mutates its list.
        self.errors = list(errors)
        count = len(self.errors)
        summary = self.errors[0] if self.errors else '(no details)'
        extra = f' (and {count - 1} more)' if count > 1 else ''
        super().__init__(f'{count} semantic error(s): {summary}{extra}')

    def __reduce__(self):
        """Rebuild from the error list when copied or pickled.

        The default ``BaseException`` reduction re-invokes ``__init__`` with
        ``self.args`` -- here the formatted summary string -- which would turn
        ``errors`` into a plain string on the reconstructed object.
        """
        return (type(self), (self.errors,), self.__dict__)

`ingest(cif, db=None, schema=None, *, propagate_fk=False, dataset_id=None)`

Ingest a parsed CifFile into a DuckDB database.

Parameters:

Name	Type	Description	Default
`cif`	`CifFile`	Parsed CifFile from build(). May contain one or more blocks.	required
`db`	`DuckDBPyConnection \| str \| Path \| None`	DuckDB connection target. None (default) creates an in-memory connection. str or Path opens (or creates) a file-backed database. An existing DuckDBPyConnection is used directly (caller retains ownership).	`None`
`schema`	`SchemaSpec \| None`	SchemaSpec used to route tags to structured tables. If None, all tags are routed to _cif_fallback.	`None`
`propagate_fk`	`bool`	When True, non-key FK columns absent from the CIF data inherit their value from the FK target already known in the same block.	`False`
`dataset_id`	`str \| None`	The _audit_dataset.id value to ingest. When None, auto-detected. Raises `ValueError` if specified but not found in any dataset block, or if None and the file contains blocks belonging to incompatible datasets (no common `_audit_dataset.id`).	`None`

Returns:

Type	Description
`tuple[DuckDBPyConnection, list[str]]`	The DuckDB connection and a list of error/warning strings.

Source code in src/cifflow/ingestion/ingest.py

def ingest(
    cif: CifFile,
    db: duckdb.DuckDBPyConnection | str | pathlib.Path | None = None,
    schema: SchemaSpec | None = None,
    *,
    propagate_fk: bool = False,
    dataset_id: str | None = None,
) -> tuple[duckdb.DuckDBPyConnection, list[str]]:
    """Ingest a parsed CifFile into a DuckDB database.

    Blocks are selected first, then the database target is resolved, so an
    invalid ``dataset_id`` never opens (or creates) a database. If ingestion
    fails after a connection was opened by this function, that connection is
    closed before the exception propagates; a connection supplied by the
    caller is never closed here.

    Parameters
    ----------
    cif:
        Parsed CifFile from build(). May contain one or more blocks.
    db:
        DuckDB connection target. None (default) creates an in-memory connection.
        str or Path opens (or creates) a file-backed database. An existing
        DuckDBPyConnection is used directly (caller retains ownership).
    schema:
        SchemaSpec used to route tags to structured tables. If None, all tags
        are routed to _cif_fallback.
    propagate_fk:
        When True, non-key FK columns absent from the CIF data inherit their
        value from the FK target already known in the same block.
    dataset_id:
        The _audit_dataset.id value to ingest. When None, auto-detected.

    Returns
    -------
    tuple[duckdb.DuckDBPyConnection, list[str]]
        The DuckDB connection and a list of error/warning strings.

    Raises
    ------
    ValueError
        If ``dataset_id`` is specified but not found in any dataset block,
        or if it is None and the file contains blocks belonging to
        incompatible datasets (no common ``_audit_dataset.id``).
    """
    errors: list[str] = []

    def emit(msg: str, **kw: Any) -> None:
        # Extra keyword context supplied by the helpers is accepted but
        # discarded; only the message text is collected.
        errors.append(msg)

    # Select blocks before resolving the database: a ValueError raised here
    # must not leave behind an opened connection or a newly created file.
    blocks = _select_blocks(cif, dataset_id)

    conn = _resolve_db(db)
    # A caller-supplied connection is returned as-is by _resolve_db (see the
    # ``db`` parameter); anything else was opened here and is ours to close
    # if ingestion fails.
    owns_connection = conn is not db

    try:
        tag_to_column = build_tag_to_column(schema) if schema else {}
        su_map = build_su_map(schema) if schema else {}
        fallback_rows: list[dict] = []
        id_regimes: dict[str, str] = {}

        if schema is not None:
            setup_duckdb(schema, conn)
            populated: set[str] = set()
            global_batch: dict[str, list[tuple]] = {}
            all_loop_group_entries: list[tuple] = []
            for position, block in enumerate(blocks):
                fallback, table_batch, blk_entries = load_block_data(
                    block, block.name, position, schema, tag_to_column, su_map,
                    set(), emit,
                )
                fallback_rows.extend(fallback)
                all_loop_group_entries.extend(blk_entries)
                # Merge per-block rows into one batch per table so every
                # table is flushed once, after all blocks have been read.
                for tbl, rows in table_batch.items():
                    global_batch.setdefault(tbl, []).extend(rows)
            flush_table_batches(conn, global_batch, populated)
            if all_loop_group_entries:
                conn.executemany(
                    'INSERT INTO "_loop_groups" ("_cifflow_block_id", "table_name", "loop_id", "min_row_id") '
                    'VALUES (?, ?, ?, ?) ON CONFLICT DO NOTHING',
                    all_loop_group_entries,
                )
            propagate_fk_sql(conn, schema, tag_to_column, propagate_fk, emit, populated)
            create_final_tables(conn, schema, populated, errors)
            id_regimes = _compute_id_regimes(
                conn, schema, [b.name for b in blocks], populated,
            )
        else:
            # No schema: only the infrastructure tables exist and every tag
            # ends up in the fallback rows.
            _create_infrastructure_tables(conn)
            row_id_counters: dict[str, int] = {}
            loop_id_counter = 1
            for block in blocks:
                loop_id_counter = _process_block_no_schema(
                    block, block.name, row_id_counters, loop_id_counter, fallback_rows,
                )

        block_order_rows = [(block.name, i) for i, block in enumerate(blocks)]

        # One membership row per (block, dataset id). Blocks without any
        # dataset id get a single row with an empty id and their computed
        # regime, defaulting to 'assumed'.
        membership_rows: list[tuple] = []
        for block in blocks:
            dataset_ids = _read_dataset_ids(block)
            if dataset_ids:
                membership_rows.extend(
                    (block.name, did, 'dataset') for did in sorted(dataset_ids)
                )
            else:
                regime = id_regimes.get(block.name, 'assumed')
                membership_rows.append((block.name, '', regime))

        # Record a warning for every block whose coherence was only assumed.
        validation_rows: list[tuple] = [
            (
                'uuid_regime', 'Warning', bid,
                f"general block '{bid}' has non-UUID PK values "
                "(or no structured rows); assumed coherence",
                'assumed',
            )
            for bid, did, regime in membership_rows
            if regime == 'assumed' and did == ''
        ]

        _flush_infra(
            conn, fallback_rows, block_order_rows, membership_rows, validation_rows,
        )
    except BaseException:
        # Do not leak a connection the caller never received a handle to.
        if owns_connection:
            conn.close()
        raise

    return conn, errors