"""
Unit tests for src/csv_grok.py
================================

Every numeric assertion is verified by an independent second method so
the test itself is trustworthy and not just a tautology.

Independent verification strategy
-----------------------------------
- Means are cross-checked via sum(values) / len(values) computed
  directly from the raw fixture lists — no pandas involved in the
  reference calculation.
- Combined row count is verified by simple integer addition.
- File content checks use plain string search, not the module's own
  formatting helpers.
"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations

import math
from pathlib import Path

import pandas as pd
import pytest

# Make src/ importable without installing the package.
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

# Module under test: the load → combine → analyse → report pipeline.
from csv_grok import (
    AnalysisResult,
    CsvBundle,
    analyse,
    load_and_combine,
    load_csv,
    write_report,
)
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
# Paths to the shared fixture CSVs
# ---------------------------------------------------------------------------

DATA_DIR = Path(__file__).parent.parent / "data"
FILE1 = DATA_DIR / "file1.csv"
FILE2 = DATA_DIR / "file2.csv"

# ---------------------------------------------------------------------------
# Ground-truth values derived by hand from the fixture files
# (independent of pandas – used as the reference in assertions)
# ---------------------------------------------------------------------------

# file1.csv score column: Alice=88.5, Bob=91.0, Carol=76.3, Dave=83.7, Eve=95.2
FILE1_SCORES = [88.5, 91.0, 76.3, 83.7, 95.2]
FILE1_SCORE_MEAN_REF = sum(FILE1_SCORES) / len(FILE1_SCORES)  # 86.94

# file2.csv score column: Frank=70.1, Grace=88.9, Hank=65.4, Iris=79.8, Jack=82.3
FILE2_SCORES = [70.1, 88.9, 65.4, 79.8, 82.3]
FILE2_SCORE_MEAN_REF = sum(FILE2_SCORES) / len(FILE2_SCORES)  # 77.30

# file1.csv salary column
FILE1_SALARIES = [72000, 65000, 85000, 70000, 90000]
FILE1_SALARY_MEAN_REF = sum(FILE1_SALARIES) / len(FILE1_SALARIES)  # 76400.0

# file2.csv salary column
FILE2_SALARIES = [95000, 58000, 110000, 68000, 88000]
FILE2_SALARY_MEAN_REF = sum(FILE2_SALARIES) / len(FILE2_SALARIES)  # 83800.0
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ===========================================================================
|
|||
|
|
# Tests for load_csv()
|
|||
|
|
# ===========================================================================
|
|||
|
|
|
|||
|
|
class TestLoadCsv:
    """Structural and happy-path coverage for load_csv()."""

    def test_returns_csv_bundle(self):
        """load_csv() must return a CsvBundle instance."""
        loaded = load_csv(FILE1)
        assert isinstance(loaded, CsvBundle)

    def test_path_attribute_preserved(self):
        """The bundle's .path must equal the path that was passed in."""
        loaded = load_csv(FILE1)
        assert loaded.path == FILE1

    def test_row_count_file1(self):
        """file1.csv has 5 data rows."""
        assert len(load_csv(FILE1).data) == 5

    def test_row_count_file2(self):
        """file2.csv has 5 data rows."""
        assert len(load_csv(FILE2).data) == 5

    def test_column_count(self):
        """Both files have exactly 4 columns."""
        for csv_path in (FILE1, FILE2):
            cols = list(load_csv(csv_path).data.columns)
            assert len(cols) == 4, (
                f"{csv_path.name} should have 4 columns, got {cols}"
            )

    def test_expected_columns_present(self):
        """Columns name, age, score, salary must all be present."""
        wanted = {"name", "age", "score", "salary"}
        for csv_path in (FILE1, FILE2):
            assert set(load_csv(csv_path).data.columns) == wanted

    def test_data_is_dataframe(self):
        """bundle.data must be a pandas DataFrame."""
        assert isinstance(load_csv(FILE1).data, pd.DataFrame)

    def test_missing_file_raises(self, tmp_path):
        """load_csv() must raise when the file does not exist."""
        missing = tmp_path / "nonexistent.csv"
        # Broad on purpose: the wrapper may raise its own exception type.
        with pytest.raises(Exception):
            load_csv(missing)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ===========================================================================
|
|||
|
|
# Tests for load_and_combine()
|
|||
|
|
# ===========================================================================
|
|||
|
|
|
|||
|
|
class TestLoadAndCombine:
    """Exercises the combine (union) stage of the pipeline."""

    def test_returns_three_tuple(self):
        """load_and_combine() must return a 3-tuple."""
        assert len(load_and_combine(FILE1, FILE2)) == 3

    def test_bundles_are_csv_bundles(self):
        """First two elements of the tuple must be CsvBundle instances."""
        first, second, _ = load_and_combine(FILE1, FILE2)
        assert isinstance(first, CsvBundle)
        assert isinstance(second, CsvBundle)

    def test_combined_is_dataframe(self):
        """Third element must be a pandas DataFrame."""
        combined = load_and_combine(FILE1, FILE2)[2]
        assert isinstance(combined, pd.DataFrame)

    def test_combined_row_count(self):
        """Combined DataFrame must have len(file1) + len(file2) rows.

        Independent check: load each file separately with pd.read_csv and
        add their lengths — no call to load_and_combine() in the reference.
        """
        first, second, combined = load_and_combine(FILE1, FILE2)

        # Reference: row counts taken straight from pd.read_csv.
        independent_total = len(pd.read_csv(FILE1)) + len(pd.read_csv(FILE2))

        assert len(combined) == independent_total
        assert len(combined) == len(first.data) + len(second.data)

    def test_combined_preserves_columns(self):
        """Combined DataFrame must retain all 4 original columns."""
        combined = load_and_combine(FILE1, FILE2)[2]
        assert set(combined.columns) == {"name", "age", "score", "salary"}

    def test_combined_index_is_reset(self):
        """Combined DataFrame index must be 0-based and contiguous."""
        combined = load_and_combine(FILE1, FILE2)[2]
        assert list(combined.index) == list(range(len(combined)))

    def test_bundle_paths_are_correct(self):
        """Each bundle must carry the path it was loaded from."""
        first, second, _ = load_and_combine(FILE1, FILE2)
        assert first.path == FILE1
        assert second.path == FILE2
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ===========================================================================
|
|||
|
|
# Tests for analyse()
|
|||
|
|
# ===========================================================================
|
|||
|
|
|
|||
|
|
class TestAnalyse:
    """Covers the analyse step: per-file means and their difference."""

    @pytest.fixture(autouse=True)
    def _bundles(self):
        # Loaded fresh for every test; the combined frame is not needed here.
        self.bundle1, self.bundle2, _ = load_and_combine(FILE1, FILE2)

    # --- happy-path: score column -------------------------------------------

    def test_returns_analysis_result(self):
        """analyse() must return an AnalysisResult instance."""
        outcome = analyse(self.bundle1, self.bundle2, "score")
        assert isinstance(outcome, AnalysisResult)

    def test_column_attribute(self):
        """result.column must equal the column name passed in."""
        outcome = analyse(self.bundle1, self.bundle2, "score")
        assert outcome.column == "score"

    def test_mean_file1_score(self):
        """mean_file1 for 'score' must match the hand-computed reference.

        Reference: sum(FILE1_SCORES) / len(FILE1_SCORES) — no pandas.
        """
        outcome = analyse(self.bundle1, self.bundle2, "score")
        assert math.isclose(outcome.mean_file1, FILE1_SCORE_MEAN_REF, rel_tol=1e-9)

    def test_mean_file2_score(self):
        """mean_file2 for 'score' must match the hand-computed reference."""
        outcome = analyse(self.bundle1, self.bundle2, "score")
        assert math.isclose(outcome.mean_file2, FILE2_SCORE_MEAN_REF, rel_tol=1e-9)

    def test_difference_score(self):
        """difference must equal mean_file1 − mean_file2 (verified independently)."""
        outcome = analyse(self.bundle1, self.bundle2, "score")
        reference_diff = FILE1_SCORE_MEAN_REF - FILE2_SCORE_MEAN_REF
        assert math.isclose(outcome.difference, reference_diff, rel_tol=1e-9)
        # The dataclass fields must also agree with each other.
        internal_diff = outcome.mean_file1 - outcome.mean_file2
        assert math.isclose(outcome.difference, internal_diff, rel_tol=1e-9)

    # --- happy-path: salary column ------------------------------------------

    def test_mean_file1_salary(self):
        """mean_file1 for 'salary' must match the hand-computed reference."""
        outcome = analyse(self.bundle1, self.bundle2, "salary")
        assert math.isclose(outcome.mean_file1, FILE1_SALARY_MEAN_REF, rel_tol=1e-9)

    def test_mean_file2_salary(self):
        """mean_file2 for 'salary' must match the hand-computed reference."""
        outcome = analyse(self.bundle1, self.bundle2, "salary")
        assert math.isclose(outcome.mean_file2, FILE2_SALARY_MEAN_REF, rel_tol=1e-9)

    def test_difference_salary(self):
        """difference for 'salary' must equal mean_file1 − mean_file2."""
        outcome = analyse(self.bundle1, self.bundle2, "salary")
        reference_diff = FILE1_SALARY_MEAN_REF - FILE2_SALARY_MEAN_REF
        assert math.isclose(outcome.difference, reference_diff, rel_tol=1e-9)

    # --- error path ---------------------------------------------------------

    def test_missing_column_raises_value_error(self):
        """analyse() must raise ValueError for a column that does not exist."""
        with pytest.raises(ValueError, match="not found"):
            analyse(self.bundle1, self.bundle2, "nonexistent_column")

    def test_error_message_contains_column_name(self):
        """The ValueError message must name the missing column."""
        missing = "ghost_column"
        with pytest.raises(ValueError, match=missing):
            analyse(self.bundle1, self.bundle2, missing)

    def test_non_numeric_column_raises_or_returns_nan(self):
        """Requesting the 'name' (string) column should either raise or return NaN mean."""
        try:
            outcome = analyse(self.bundle1, self.bundle2, "name")
            # No exception: the means must then be NaN (pandas behaviour for strings)
            assert math.isnan(outcome.mean_file1) or math.isnan(outcome.mean_file2)
        except (TypeError, ValueError):
            pass  # raising is also acceptable
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ===========================================================================
|
|||
|
|
# Tests for write_report()
|
|||
|
|
# ===========================================================================
|
|||
|
|
|
|||
|
|
class TestWriteReport:
    """Tests for the report-writing step."""

    @pytest.fixture()
    def result(self):
        # Analyse the 'score' column as the fixture every test consumes.
        bundle1, bundle2, _ = load_and_combine(FILE1, FILE2)
        return analyse(bundle1, bundle2, "score")

    def test_file_is_created(self, tmp_path, result):
        """write_report() must create the output file."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        assert report_path.exists()

    def test_file_contains_column_name(self, tmp_path, result):
        """The report must mention the column that was analysed."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        assert "score" in report_path.read_text()

    def test_file_contains_mean1(self, tmp_path, result):
        """The report must contain the mean of file 1 (to 4 decimal places)."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        # Independent reference: format the hand-computed value the same way
        assert f"{FILE1_SCORE_MEAN_REF:.4f}" in report_path.read_text()

    def test_file_contains_mean2(self, tmp_path, result):
        """The report must contain the mean of file 2 (to 4 decimal places)."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        assert f"{FILE2_SCORE_MEAN_REF:.4f}" in report_path.read_text()

    def test_file_is_non_empty(self, tmp_path, result):
        """The report file must not be empty."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        assert report_path.stat().st_size > 0
|