- Add .roomodes with four custom Roo agents:
• excalidraw-to-python – diagram → typed Python skeleton
• python-coder – skeleton → production implementation (design patterns)
• tester – pytest suite writer & runner
• orchestrator – coordinates the full excalidraw→code→test→execute pipeline
- Add src/csv_grok.py and tests/test_csv_grok.py (CSV diff utility)
- Add examples/gnarly_csv/ with gnarly_csv_delta.py and sample data (rev_a/rev_b)
- Add drawings/ with design.excalidraw and gnarly_csv_files.excalidraw
- Add docs/excalidraw-to-python-agent.md
- Add requirements.txt and .gitignore
305 lines
12 KiB
Python
305 lines
12 KiB
Python
"""
|
||
Unit tests for src/csv_grok.py
|
||
================================
|
||
Every numeric assertion is verified by an independent second method so
|
||
the test itself is trustworthy and not just a tautology.
|
||
|
||
Independent verification strategy
|
||
-----------------------------------
|
||
- Means are cross-checked via sum(values) / len(values) computed
|
||
directly from the raw fixture lists — no pandas involved in the
|
||
reference calculation.
|
||
- Combined row count is verified by simple integer addition.
|
||
- File content checks use plain string search, not the module's own
|
||
formatting helpers.
|
||
"""
|
||
|
||
from __future__ import annotations

import math
import re
import sys
from pathlib import Path

import pandas as pd
import pytest

# Make src/ importable without installing the package.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from csv_grok import (  # noqa: E402  (must follow the sys.path tweak above)
    AnalysisResult,
    CsvBundle,
    analyse,
    load_and_combine,
    load_csv,
    write_report,
)

# ---------------------------------------------------------------------------
# Paths to the shared fixture CSVs
# ---------------------------------------------------------------------------

# The fixture directory is resolved relative to this module: "data" sits
# beside the directory that contains this test file.
DATA_DIR = Path(__file__).parent.parent / "data"
FILE1 = DATA_DIR / "file1.csv"
FILE2 = DATA_DIR / "file2.csv"

# ---------------------------------------------------------------------------
# Ground-truth values derived by hand from the fixture files
# (independent of pandas – used as the reference in assertions)
# ---------------------------------------------------------------------------

# file1.csv score column: Alice=88.5, Bob=91.0, Carol=76.3, Dave=83.7, Eve=95.2
FILE1_SCORES = [88.5, 91.0, 76.3, 83.7, 95.2]
FILE1_SCORE_MEAN_REF = sum(FILE1_SCORES) / len(FILE1_SCORES)  # 86.94

# file2.csv score column: Frank=70.1, Grace=88.9, Hank=65.4, Iris=79.8, Jack=82.3
FILE2_SCORES = [70.1, 88.9, 65.4, 79.8, 82.3]
FILE2_SCORE_MEAN_REF = sum(FILE2_SCORES) / len(FILE2_SCORES)  # 77.30

# file1.csv salary column
FILE1_SALARIES = [72000, 65000, 85000, 70000, 90000]
FILE1_SALARY_MEAN_REF = sum(FILE1_SALARIES) / len(FILE1_SALARIES)  # 76400.0

# file2.csv salary column
FILE2_SALARIES = [95000, 58000, 110000, 68000, 88000]
FILE2_SALARY_MEAN_REF = sum(FILE2_SALARIES) / len(FILE2_SALARIES)  # 83800.0


# ===========================================================================
# Tests for load_csv()
# ===========================================================================

class TestLoadCsv:
    """Happy-path and structural checks for load_csv()."""

    def test_returns_csv_bundle(self):
        """load_csv() must return a CsvBundle instance."""
        bundle = load_csv(FILE1)
        assert isinstance(bundle, CsvBundle)

    def test_path_attribute_preserved(self):
        """The bundle's .path must equal the path that was passed in."""
        bundle = load_csv(FILE1)
        assert bundle.path == FILE1

    def test_row_count_file1(self):
        """file1.csv has 5 data rows."""
        bundle = load_csv(FILE1)
        assert len(bundle.data) == 5

    def test_row_count_file2(self):
        """file2.csv has 5 data rows."""
        bundle = load_csv(FILE2)
        assert len(bundle.data) == 5

    def test_column_count(self):
        """Both files have exactly 4 columns."""
        for path in (FILE1, FILE2):
            bundle = load_csv(path)
            # The message names the offending file, since one test covers both.
            assert len(bundle.data.columns) == 4, (
                f"{path.name} should have 4 columns, got {list(bundle.data.columns)}"
            )

    def test_expected_columns_present(self):
        """Columns name, age, score, salary must all be present."""
        expected = {"name", "age", "score", "salary"}
        for path in (FILE1, FILE2):
            bundle = load_csv(path)
            # Same per-file diagnostic as test_column_count, so a failure
            # identifies which fixture is wrong.
            assert expected == set(bundle.data.columns), (
                f"{path.name} has unexpected columns {list(bundle.data.columns)}"
            )

    def test_data_is_dataframe(self):
        """bundle.data must be a pandas DataFrame."""
        bundle = load_csv(FILE1)
        assert isinstance(bundle.data, pd.DataFrame)

    def test_missing_file_raises(self, tmp_path):
        """load_csv() must raise when the file does not exist."""
        # NOTE(review): the concrete exception type raised by load_csv() is
        # not visible from this module, so the broad Exception is kept.
        # Tighten it (e.g. to FileNotFoundError) once confirmed.
        with pytest.raises(Exception):
            load_csv(tmp_path / "nonexistent.csv")


# ===========================================================================
# Tests for load_and_combine()
# ===========================================================================

class TestLoadAndCombine:
    """Tests for the Combine (union) pipeline step."""

    def test_returns_three_tuple(self):
        """load_and_combine() must return a 3-tuple."""
        result = load_and_combine(FILE1, FILE2)
        assert len(result) == 3

    def test_bundles_are_csv_bundles(self):
        """First two elements of the tuple must be CsvBundle instances."""
        b1, b2, _ = load_and_combine(FILE1, FILE2)
        assert isinstance(b1, CsvBundle)
        assert isinstance(b2, CsvBundle)

    def test_combined_is_dataframe(self):
        """Third element must be a pandas DataFrame."""
        _, _, combined = load_and_combine(FILE1, FILE2)
        assert isinstance(combined, pd.DataFrame)

    def test_combined_row_count(self):
        """Combined DataFrame must have len(file1) + len(file2) rows.

        Independent check: load each file separately with pd.read_csv and
        add their lengths — no call to load_and_combine() in the reference.
        """
        b1, b2, combined = load_and_combine(FILE1, FILE2)

        # Reference: independent row counts via direct pd.read_csv
        ref_rows = len(pd.read_csv(FILE1)) + len(pd.read_csv(FILE2))

        assert len(combined) == ref_rows
        # Second check: the combined length also agrees with the bundles the
        # function itself returned.
        assert len(combined) == len(b1.data) + len(b2.data)

    def test_combined_preserves_columns(self):
        """Combined DataFrame must retain all 4 original columns."""
        _, _, combined = load_and_combine(FILE1, FILE2)
        assert set(combined.columns) == {"name", "age", "score", "salary"}

    def test_combined_index_is_reset(self):
        """Combined DataFrame index must be 0-based and contiguous."""
        _, _, combined = load_and_combine(FILE1, FILE2)
        expected_index = list(range(len(combined)))
        assert list(combined.index) == expected_index

    def test_bundle_paths_are_correct(self):
        """Each bundle must carry the path it was loaded from."""
        b1, b2, _ = load_and_combine(FILE1, FILE2)
        assert b1.path == FILE1
        assert b2.path == FILE2


# ===========================================================================
# Tests for analyse()
# ===========================================================================

class TestAnalyse:
    """Tests for the Analyse pipeline step — means and difference."""

    @pytest.fixture(autouse=True)
    def _bundles(self):
        """Load both fixture files and expose them as self.b1 / self.b2.

        Marked autouse so every test in this class gets the bundles without
        requesting the fixture explicitly.
        """
        self.b1, self.b2, _ = load_and_combine(FILE1, FILE2)

    # --- happy-path: score column -------------------------------------------

    def test_returns_analysis_result(self):
        """analyse() must return an AnalysisResult instance."""
        result = analyse(self.b1, self.b2, "score")
        assert isinstance(result, AnalysisResult)

    def test_column_attribute(self):
        """result.column must equal the column name passed in."""
        result = analyse(self.b1, self.b2, "score")
        assert result.column == "score"

    def test_mean_file1_score(self):
        """mean_file1 for 'score' must match the hand-computed reference.

        Reference: sum(FILE1_SCORES) / len(FILE1_SCORES) — no pandas.
        """
        result = analyse(self.b1, self.b2, "score")
        assert math.isclose(result.mean_file1, FILE1_SCORE_MEAN_REF, rel_tol=1e-9)

    def test_mean_file2_score(self):
        """mean_file2 for 'score' must match the hand-computed reference."""
        result = analyse(self.b1, self.b2, "score")
        assert math.isclose(result.mean_file2, FILE2_SCORE_MEAN_REF, rel_tol=1e-9)

    def test_difference_score(self):
        """difference must equal mean_file1 − mean_file2 (verified independently)."""
        result = analyse(self.b1, self.b2, "score")
        expected_diff = FILE1_SCORE_MEAN_REF - FILE2_SCORE_MEAN_REF
        assert math.isclose(result.difference, expected_diff, rel_tol=1e-9)
        # Also verify the internal consistency of the dataclass fields
        assert math.isclose(
            result.difference, result.mean_file1 - result.mean_file2, rel_tol=1e-9
        )

    # --- happy-path: salary column ------------------------------------------

    def test_mean_file1_salary(self):
        """mean_file1 for 'salary' must match the hand-computed reference."""
        result = analyse(self.b1, self.b2, "salary")
        assert math.isclose(result.mean_file1, FILE1_SALARY_MEAN_REF, rel_tol=1e-9)

    def test_mean_file2_salary(self):
        """mean_file2 for 'salary' must match the hand-computed reference."""
        result = analyse(self.b1, self.b2, "salary")
        assert math.isclose(result.mean_file2, FILE2_SALARY_MEAN_REF, rel_tol=1e-9)

    def test_difference_salary(self):
        """difference for 'salary' must equal mean_file1 − mean_file2."""
        result = analyse(self.b1, self.b2, "salary")
        expected_diff = FILE1_SALARY_MEAN_REF - FILE2_SALARY_MEAN_REF
        assert math.isclose(result.difference, expected_diff, rel_tol=1e-9)

    # --- error path ---------------------------------------------------------

    def test_missing_column_raises_value_error(self):
        """analyse() must raise ValueError for a column that does not exist."""
        with pytest.raises(ValueError, match="not found"):
            analyse(self.b1, self.b2, "nonexistent_column")

    def test_error_message_contains_column_name(self):
        """The ValueError message must name the missing column."""
        bad_col = "ghost_column"
        # `match` is applied as a regular expression, so escape the literal
        # to keep the check exact even if the name gains metacharacters.
        with pytest.raises(ValueError, match=re.escape(bad_col)):
            analyse(self.b1, self.b2, bad_col)

    def test_non_numeric_column_raises_or_returns_nan(self):
        """Requesting the 'name' (string) column should either raise or return NaN mean."""
        # Only the analyse() call is guarded. With the assertion inside the
        # try, a TypeError raised by math.isnan() on a non-float mean would be
        # mistaken for the "raising is acceptable" outcome.
        try:
            result = analyse(self.b1, self.b2, "name")
        except (TypeError, ValueError):
            return  # raising is also acceptable

        # If it doesn't raise, the means must be NaN (pandas behaviour for strings)
        assert math.isnan(result.mean_file1) or math.isnan(result.mean_file2)


# ===========================================================================
# Tests for write_report()
# ===========================================================================

class TestWriteReport:
    """Tests for the report-writing step."""

    @pytest.fixture()
    def result(self):
        """Return the analyse() result for the 'score' column of both fixtures."""
        b1, b2, _ = load_and_combine(FILE1, FILE2)
        return analyse(b1, b2, "score")

    def test_file_is_created(self, tmp_path, result):
        """write_report() must create the output file."""
        out = tmp_path / "report.txt"
        write_report(result, out)
        assert out.exists()

    def test_file_contains_column_name(self, tmp_path, result):
        """The report must mention the column that was analysed."""
        out = tmp_path / "report.txt"
        write_report(result, out)
        assert "score" in out.read_text()

    def test_file_contains_mean1(self, tmp_path, result):
        """The report must contain the mean of file 1 (to 4 decimal places)."""
        out = tmp_path / "report.txt"
        write_report(result, out)
        # Independent reference: format the hand-computed value the same way
        expected_str = f"{FILE1_SCORE_MEAN_REF:.4f}"
        assert expected_str in out.read_text()

    def test_file_contains_mean2(self, tmp_path, result):
        """The report must contain the mean of file 2 (to 4 decimal places)."""
        out = tmp_path / "report.txt"
        write_report(result, out)
        expected_str = f"{FILE2_SCORE_MEAN_REF:.4f}"
        assert expected_str in out.read_text()

    def test_file_is_non_empty(self, tmp_path, result):
        """The report file must not be empty."""
        out = tmp_path / "report.txt"
        write_report(result, out)
        assert out.stat().st_size > 0