Skip to content

Validation

cifflow.validation._validate

Unified validation facade for cifflow.

ValidationReport dataclass

Source code in src/cifflow/validation/_validate.py
41
42
43
44
45
@dataclass
class ValidationReport:
    """Aggregate outcome of a ``validate()`` run.

    Bundles every issue collected across the validation stages together
    with an overall pass/fail flag and, when available, the database
    connection produced during ingestion.
    """
    # True when no collected issue has severity 'Error'.
    passed:   bool
    # All issues gathered from the stages that ran, in the order they were appended.
    issues:   list[ValidationIssue]
    # Connection returned by ingestion; None when parsing failed, the CIF had
    # no blocks, ingestion raised before returning a connection, or the
    # database stage reported an internal error.
    database: duckdb.DuckDBPyConnection | None

ValidationIssue dataclass

Source code in src/cifflow/validation/_validate.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
@dataclass
class ValidationIssue:
    """A single finding reported by one of the validation stages.

    Location fields that do not apply to a given finding are set to
    ``None``; for example, internal-error issues carry ``None`` in every
    location field.
    """
    # Stage that produced the issue.
    stage:      Literal['parse', 'ingest', 'database']
    # Only 'Error' issues cause the overall report to fail.
    severity:   Literal['Error', 'Warning', 'Info']
    # Identifier of the check that fired (e.g. 'internal_error', 'dataset_error').
    check:      str
    # Human-readable description of the problem.
    message:    str
    # NOTE(review): the fields below look like CIF source coordinates
    # (block name, tag, offending value, line/column) — confirm against
    # the parse/ingest converters.
    block:      str | None
    tag:        str | None
    value:      str | None
    line:       int | None
    col:        int | None
    # NOTE(review): the fields below presumably mirror the database-stage
    # result location (table, column, row id, key column values) — confirm
    # against the database result converter.
    table:      str | None
    column:     str | None
    row_id:     int | None
    key_values: dict[str, str | None] | None

validate(source, schema=None, *, parse_errors=None, block_id=None, dataset_id=None, propagate_fk=False)

Parse (if needed), ingest to an in-memory database, and validate against the schema.

Returns a unified `ValidationReport`. Never raises.

Source code in src/cifflow/validation/_validate.py
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
def validate(
    source: str | pathlib.Path | CifFile,
    schema: SchemaSpec | None = None,
    *,
    parse_errors: list[ParseError] | None = None,
    block_id: str | None = None,
    dataset_id: str | None = None,
    propagate_fk: bool = False,
) -> ValidationReport:
    """Run the parse, ingest and database validation stages on *source*.

    ``source`` may be an already-built ``CifFile``, a ``pathlib.Path`` that is
    read as UTF-8 text, or a ``str`` that is handed to ``build()`` as-is.
    ``parse_errors`` is only honoured together with a ``CifFile``; for any
    other source a ``UserWarning`` is emitted and the errors reported by
    ``build()`` are used instead.

    Exceptions raised while parsing or ingesting are converted into issues
    rather than propagated.  The database stage runs only when ingestion
    succeeded and a ``schema`` was supplied.

    Returns a :class:`ValidationReport` whose ``passed`` flag is true when no
    issue of severity ``'Error'`` was collected.
    """
    found: list[ValidationIssue] = []

    def _internal_error(stage_name, text):
        # Internal failures carry no source or database location.
        return ValidationIssue(
            stage=stage_name, severity='Error', check='internal_error',
            message=text,
            block=None, tag=None, value=None,
            line=None, col=None,
            table=None, column=None, row_id=None, key_values=None,
        )

    def _report(connection):
        # Single place where the pass/fail verdict is derived from the issues.
        failed = any(item.severity == 'Error' for item in found)
        return ValidationReport(
            passed=not failed,
            issues=found,
            database=connection,
        )

    # ---- Stage 1: parse ------------------------------------------------ #
    try:
        if isinstance(source, CifFile):
            cif = source
            collected = () if parse_errors is None else parse_errors
        else:
            if parse_errors is not None:
                warnings.warn(
                    "parse_errors is ignored when source is a str or Path; "
                    "errors are collected internally from build()",
                    UserWarning,
                    stacklevel=2,
                )
            text = source
            if isinstance(source, pathlib.Path):
                text = pathlib.Path(source).read_text(encoding='utf-8')
            cif, collected = build(text)
        # Append one at a time so issues converted before a failure are kept.
        for parse_err in collected:
            found.append(_parse_error_to_issue(parse_err))
    except Exception as exc:
        found.append(_internal_error('parse', str(exc)))
        return _report(None)

    # ---- Stage 2: ingest ----------------------------------------------- #
    if not cif.blocks:
        # Nothing to ingest; report whatever the parser found.
        return _report(None)

    db: duckdb.DuckDBPyConnection | None = None
    try:
        db, ingest_errors = ingest(
            cif, schema=schema,
            dataset_id=dataset_id,
            propagate_fk=propagate_fk,
        )
        for ingest_msg in ingest_errors:
            found.append(_ingest_msg_to_issue(ingest_msg, 'Warning'))
    except Exception as exc:
        # ValueError is reported as a dataset problem, anything else as internal.
        check_name = 'dataset_error' if isinstance(exc, ValueError) else 'internal_error'
        found.append(_ingest_exc_to_issue(check_name, str(exc)))
        # A failed ingest skips the database stage; db keeps whatever was bound.
        return _report(db)

    # ---- Stage 3: database --------------------------------------------- #
    if schema is None or db is None:
        return _report(db)

    saw_internal_error = False
    for result in validate_database(db, schema, block_id=block_id, strict_container_nulls=True):
        if result.check == 'internal_error':
            saw_internal_error = True
            found.append(_internal_error('database', result.message))
        else:
            found.append(_db_result_to_issue(result))

    # The connection is withheld when the database stage failed internally.
    return _report(None if saw_internal_error else db)

cifflow.validation._db_validate

Database-stage validation for cifflow.

DbValidationResult dataclass

Source code in src/cifflow/validation/_db_validate.py
27
28
29
30
31
32
33
34
35
36
37
38
@dataclass
class DbValidationResult:
    """One finding produced by database-stage validation.

    Results with ``check == 'internal_error'`` are placeholders for an
    unexpected exception: their location fields are empty strings,
    ``row_id`` is ``0`` and ``key_values`` is an empty dict.
    """
    # Location of the finding; empty strings / 0 / {} for 'internal_error'.
    table:      str
    column:     str
    tag:        str
    block_id:   str
    row_id:     int
    key_values: dict[str, str | None]
    # NOTE(review): presumably the offending value rendered as text — confirm.
    value:      str
    # Identifier of the check that fired (e.g. 'internal_error').
    check:      str
    severity:   Literal['Error', 'Warning']
    # Human-readable description; for 'internal_error' this is str(exception).
    message:    str

validate_database(db, schema, *, block_id=None, strict_container_nulls=True)

Validate a DuckDB database against a schema.

Never raises; unexpected exceptions are returned as 'internal_error' results.

Source code in src/cifflow/validation/_db_validate.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def validate_database(
    db: duckdb.DuckDBPyConnection,
    schema: SchemaSpec,
    *,
    block_id: str | None = None,
    strict_container_nulls: bool = True,
) -> list[DbValidationResult]:
    """
    Check the contents of a DuckDB database against *schema*.

    The work is delegated to ``_run_validation``, which fills the result
    list in place.  Any exception it raises is captured and appended as a
    single ``'internal_error'`` result of severity ``'Error'``, after
    whatever results had already been collected, so this function does not
    propagate validation failures.
    """
    collected: list[DbValidationResult] = []
    try:
        _run_validation(db, schema, block_id, strict_container_nulls, collected)
    except Exception as exc:
        # Placeholder result: an internal failure has no table/row location.
        failure = DbValidationResult(
            table='',
            column='',
            tag='',
            block_id='',
            row_id=0,
            key_values={},
            value='',
            check='internal_error',
            severity='Error',
            message=str(exc),
        )
        collected.append(failure)
    return collected