Files
project_for_kamlesh/tests/test_csv_grok.py
Vyaas 18a3b464f2 🎉 Initial commit: project scaffold, agent modes, and gnarly_csv example
- Add .roomodes with four custom Roo agents:
    • excalidraw-to-python  – diagram → typed Python skeleton
    • python-coder          – skeleton → production implementation (design patterns)
    • tester                – pytest suite writer & runner
    • orchestrator          – coordinates the full excalidraw→code→test→execute pipeline
- Add src/csv_grok.py and tests/test_csv_grok.py (CSV diff utility)
- Add examples/gnarly_csv/ with gnarly_csv_delta.py and sample data (rev_a/rev_b)
- Add drawings/ with design.excalidraw and gnarly_csv_files.excalidraw
- Add docs/excalidraw-to-python-agent.md
- Add requirements.txt and .gitignore
2026-04-11 15:54:42 -07:00

305 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Unit tests for src/csv_grok.py
================================
Every numeric assertion is verified by an independent second method so
the test itself is trustworthy and not just a tautology.
Independent verification strategy
-----------------------------------
- Means are cross-checked via sum(values) / len(values) computed
directly from the raw fixture lists — no pandas involved in the
reference calculation.
- Combined row count is verified by simple integer addition of the two
per-file row counts, each read independently with pd.read_csv.
- File content checks use plain string search, not the module's own
formatting helpers.
"""
from __future__ import annotations
import math
from pathlib import Path
import pandas as pd
import pytest
# Make src/ importable without installing the package.
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from csv_grok import (
AnalysisResult,
CsvBundle,
analyse,
load_and_combine,
load_csv,
write_report,
)
# ---------------------------------------------------------------------------
# Locations of the shared fixture CSVs (resolved relative to this test file)
# ---------------------------------------------------------------------------
_PROJECT_ROOT = Path(__file__).parent.parent
DATA_DIR = _PROJECT_ROOT / "data"
FILE1 = DATA_DIR.joinpath("file1.csv")
FILE2 = DATA_DIR.joinpath("file2.csv")


# ---------------------------------------------------------------------------
# Ground-truth values copied by hand from the fixture files.
# The reference means below are computed with builtins only, so they do not
# depend on pandas and can serve as an independent check in the assertions.
# ---------------------------------------------------------------------------
def _plain_mean(values):
    """Return the arithmetic mean of *values* as sum(values) / len(values)."""
    return sum(values) / len(values)


# file1.csv score column: Alice=88.5, Bob=91.0, Carol=76.3, Dave=83.7, Eve=95.2
FILE1_SCORES = [88.5, 91.0, 76.3, 83.7, 95.2]
FILE1_SCORE_MEAN_REF = _plain_mean(FILE1_SCORES)  # 86.94

# file2.csv score column: Frank=70.1, Grace=88.9, Hank=65.4, Iris=79.8, Jack=82.3
FILE2_SCORES = [70.1, 88.9, 65.4, 79.8, 82.3]
FILE2_SCORE_MEAN_REF = _plain_mean(FILE2_SCORES)  # 77.30

# file1.csv salary column
FILE1_SALARIES = [72000, 65000, 85000, 70000, 90000]
FILE1_SALARY_MEAN_REF = _plain_mean(FILE1_SALARIES)  # 76400.0

# file2.csv salary column
FILE2_SALARIES = [95000, 58000, 110000, 68000, 88000]
FILE2_SALARY_MEAN_REF = _plain_mean(FILE2_SALARIES)  # 83800.0
# ===========================================================================
# Tests for load_csv()
# ===========================================================================
class TestLoadCsv:
    """Structural and happy-path checks for load_csv()."""

    def test_returns_csv_bundle(self):
        """Loading a fixture file yields a CsvBundle."""
        assert isinstance(load_csv(FILE1), CsvBundle)

    def test_path_attribute_preserved(self):
        """The returned bundle reports the same path it was given."""
        loaded = load_csv(FILE1)
        assert loaded.path == FILE1

    def test_row_count_file1(self):
        """file1.csv is expected to hold 5 data rows."""
        frame = load_csv(FILE1).data
        assert len(frame) == 5

    def test_row_count_file2(self):
        """file2.csv is expected to hold 5 data rows."""
        frame = load_csv(FILE2).data
        assert len(frame) == 5

    def test_column_count(self):
        """Each fixture file is expected to expose exactly 4 columns."""
        for path in (FILE1, FILE2):
            bundle = load_csv(path)
            n_columns = len(bundle.data.columns)
            assert n_columns == 4, (
                f"{path.name} should have 4 columns, got {list(bundle.data.columns)}"
            )

    def test_expected_columns_present(self):
        """The column set must be exactly name, age, score and salary."""
        required = {"name", "age", "score", "salary"}
        for csv_path in (FILE1, FILE2):
            found = set(load_csv(csv_path).data.columns)
            assert required == found

    def test_data_is_dataframe(self):
        """The bundle's .data attribute must be a pandas DataFrame."""
        payload = load_csv(FILE1).data
        assert isinstance(payload, pd.DataFrame)

    def test_missing_file_raises(self, tmp_path):
        """load_csv() must raise when pointed at a path that does not exist."""
        absent = tmp_path / "nonexistent.csv"
        # NOTE(review): Exception is deliberately broad because the concrete
        # error type raised by load_csv() is not visible from this file;
        # narrow it once that contract is confirmed.
        with pytest.raises(Exception):
            load_csv(absent)
# ===========================================================================
# Tests for load_and_combine()
# ===========================================================================
class TestLoadAndCombine:
    """Checks for the Combine (union) step of the pipeline."""

    def test_returns_three_tuple(self):
        """The return value must contain exactly three elements."""
        outcome = load_and_combine(FILE1, FILE2)
        assert len(outcome) == 3

    def test_bundles_are_csv_bundles(self):
        """The first and second returned elements must be CsvBundle objects."""
        first, second, _ = load_and_combine(FILE1, FILE2)
        assert isinstance(first, CsvBundle)
        assert isinstance(second, CsvBundle)

    def test_combined_is_dataframe(self):
        """The third returned element must be a pandas DataFrame."""
        merged = load_and_combine(FILE1, FILE2)[2]
        assert isinstance(merged, pd.DataFrame)

    def test_combined_row_count(self):
        """The combined frame must have len(file1) + len(file2) rows.

        The reference count is obtained by reading each fixture directly
        with pd.read_csv and adding the lengths, so load_and_combine() plays
        no part in computing the expected value.
        """
        first, second, merged = load_and_combine(FILE1, FILE2)
        # Independent reference: row counts straight from pd.read_csv.
        expected_rows = sum(len(pd.read_csv(src)) for src in (FILE1, FILE2))
        assert len(merged) == expected_rows
        # The bundles returned alongside must agree with the combined length.
        assert len(merged) == len(first.data) + len(second.data)

    def test_combined_preserves_columns(self):
        """The combined frame must keep exactly the 4 original columns."""
        merged = load_and_combine(FILE1, FILE2)[2]
        assert set(merged.columns) == {"name", "age", "score", "salary"}

    def test_combined_index_is_reset(self):
        """The combined frame's index must run 0..n-1 without gaps."""
        merged = load_and_combine(FILE1, FILE2)[2]
        actual_index = list(merged.index)
        assert actual_index == list(range(len(merged)))

    def test_bundle_paths_are_correct(self):
        """Each returned bundle must carry the path it was loaded from."""
        first, second, _ = load_and_combine(FILE1, FILE2)
        assert first.path == FILE1
        assert second.path == FILE2
# ===========================================================================
# Tests for analyse()
# ===========================================================================
class TestAnalyse:
    """Tests for the Analyse pipeline step — means and difference."""

    @pytest.fixture(autouse=True)
    def _bundles(self):
        """Load both fixture files once per test and keep the two bundles."""
        self.b1, self.b2, _ = load_and_combine(FILE1, FILE2)

    # --- happy-path: score column -------------------------------------------
    def test_returns_analysis_result(self):
        """analyse() must return an AnalysisResult instance."""
        result = analyse(self.b1, self.b2, "score")
        assert isinstance(result, AnalysisResult)

    def test_column_attribute(self):
        """result.column must equal the column name passed in."""
        result = analyse(self.b1, self.b2, "score")
        assert result.column == "score"

    def test_mean_file1_score(self):
        """mean_file1 for 'score' must match the hand-computed reference.

        Reference: sum(FILE1_SCORES) / len(FILE1_SCORES) — no pandas.
        """
        result = analyse(self.b1, self.b2, "score")
        assert math.isclose(result.mean_file1, FILE1_SCORE_MEAN_REF, rel_tol=1e-9)

    def test_mean_file2_score(self):
        """mean_file2 for 'score' must match the hand-computed reference."""
        result = analyse(self.b1, self.b2, "score")
        assert math.isclose(result.mean_file2, FILE2_SCORE_MEAN_REF, rel_tol=1e-9)

    def test_difference_score(self):
        """difference must equal mean_file1 - mean_file2 (verified independently)."""
        result = analyse(self.b1, self.b2, "score")
        expected_diff = FILE1_SCORE_MEAN_REF - FILE2_SCORE_MEAN_REF
        assert math.isclose(result.difference, expected_diff, rel_tol=1e-9)
        # Also verify the internal consistency of the result's own fields.
        assert math.isclose(
            result.difference, result.mean_file1 - result.mean_file2, rel_tol=1e-9
        )

    # --- happy-path: salary column ------------------------------------------
    def test_mean_file1_salary(self):
        """mean_file1 for 'salary' must match the hand-computed reference."""
        result = analyse(self.b1, self.b2, "salary")
        assert math.isclose(result.mean_file1, FILE1_SALARY_MEAN_REF, rel_tol=1e-9)

    def test_mean_file2_salary(self):
        """mean_file2 for 'salary' must match the hand-computed reference."""
        result = analyse(self.b1, self.b2, "salary")
        assert math.isclose(result.mean_file2, FILE2_SALARY_MEAN_REF, rel_tol=1e-9)

    def test_difference_salary(self):
        """difference for 'salary' must equal mean_file1 - mean_file2."""
        result = analyse(self.b1, self.b2, "salary")
        expected_diff = FILE1_SALARY_MEAN_REF - FILE2_SALARY_MEAN_REF
        assert math.isclose(result.difference, expected_diff, rel_tol=1e-9)

    # --- error path ---------------------------------------------------------
    def test_missing_column_raises_value_error(self):
        """analyse() must raise ValueError for a column that does not exist."""
        with pytest.raises(ValueError, match="not found"):
            analyse(self.b1, self.b2, "nonexistent_column")

    def test_error_message_contains_column_name(self):
        """The ValueError message must name the missing column."""
        bad_col = "ghost_column"
        with pytest.raises(ValueError, match=bad_col):
            analyse(self.b1, self.b2, bad_col)

    def test_non_numeric_column_raises_or_returns_nan(self):
        """Requesting the 'name' (string) column should either raise or return NaN mean."""
        # Only the analyse() call is guarded: keeping the assertion outside the
        # try ensures a TypeError coming from math.isnan() on a non-float mean
        # is reported as a failure instead of being mistaken for analyse()
        # having raised.
        try:
            result = analyse(self.b1, self.b2, "name")
        except (TypeError, ValueError):
            return  # raising is also acceptable
        # If it doesn't raise, at least one mean must be NaN.
        assert math.isnan(result.mean_file1) or math.isnan(result.mean_file2)
# ===========================================================================
# Tests for write_report()
# ===========================================================================
class TestWriteReport:
    """Checks for the report-writing step of the pipeline."""

    @pytest.fixture()
    def result(self):
        """Provide the analysis of the 'score' column for both fixture files."""
        first, second, _ = load_and_combine(FILE1, FILE2)
        return analyse(first, second, "score")

    @staticmethod
    def _emit(directory, analysis):
        """Write *analysis* to report.txt inside *directory*; return that path."""
        target = directory / "report.txt"
        write_report(analysis, target)
        return target

    def test_file_is_created(self, tmp_path, result):
        """Calling write_report() must leave a file at the requested path."""
        report_path = self._emit(tmp_path, result)
        assert report_path.exists()

    def test_file_contains_column_name(self, tmp_path, result):
        """The written report must mention the analysed column."""
        body = self._emit(tmp_path, result).read_text()
        assert "score" in body

    def test_file_contains_mean1(self, tmp_path, result):
        """The report must contain file 1's mean rendered to 4 decimal places."""
        body = self._emit(tmp_path, result).read_text()
        # Independent reference: the hand-computed mean, formatted to 4 places.
        needle = f"{FILE1_SCORE_MEAN_REF:.4f}"
        assert needle in body

    def test_file_contains_mean2(self, tmp_path, result):
        """The report must contain file 2's mean rendered to 4 decimal places."""
        body = self._emit(tmp_path, result).read_text()
        needle = f"{FILE2_SCORE_MEAN_REF:.4f}"
        assert needle in body

    def test_file_is_non_empty(self, tmp_path, result):
        """The written report must contain at least one byte."""
        report_path = self._emit(tmp_path, result)
        assert report_path.stat().st_size > 0