Files
project_for_kamlesh/tests/test_csv_grok.py

305 lines
12 KiB
Python
Raw Normal View History

"""
Unit tests for src/csv_grok.py
================================
Every numeric assertion is verified by an independent second method so
the test itself is trustworthy and not just a tautology.
Independent verification strategy
-----------------------------------
- Means are cross-checked via sum(values) / len(values) computed
directly from the raw fixture lists — no pandas involved in the
reference calculation.
- Combined row count is verified by simple integer addition.
- File content checks use plain string search, not the module's own
formatting helpers.
"""
from __future__ import annotations
import math
from pathlib import Path
import pandas as pd
import pytest
# Make src/ importable without installing the package.
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from csv_grok import (
AnalysisResult,
CsvBundle,
analyse,
load_and_combine,
load_csv,
write_report,
)
# ---------------------------------------------------------------------------
# Paths to the shared fixture CSVs
# ---------------------------------------------------------------------------
# The fixture CSVs live in <repo>/data, two directory levels above this file.
DATA_DIR = Path(__file__).parent.parent.joinpath("data")
FILE1 = DATA_DIR.joinpath("file1.csv")
FILE2 = DATA_DIR.joinpath("file2.csv")

# ---------------------------------------------------------------------------
# Ground-truth values transcribed by hand from the fixture files
# (independent of pandas — used as the reference in assertions)
# ---------------------------------------------------------------------------

# Raw column values, in file row order.
# file1.csv score column: Alice=88.5, Bob=91.0, Carol=76.3, Dave=83.7, Eve=95.2
FILE1_SCORES = [88.5, 91.0, 76.3, 83.7, 95.2]
# file2.csv score column: Frank=70.1, Grace=88.9, Hank=65.4, Iris=79.8, Jack=82.3
FILE2_SCORES = [70.1, 88.9, 65.4, 79.8, 82.3]
# file1.csv salary column
FILE1_SALARIES = [72000, 65000, 85000, 70000, 90000]
# file2.csv salary column
FILE2_SALARIES = [95000, 58000, 110000, 68000, 88000]

# Reference means: plain-Python sum / count over the lists above.
FILE1_SCORE_MEAN_REF = sum(FILE1_SCORES) / len(FILE1_SCORES)  # 86.94
FILE2_SCORE_MEAN_REF = sum(FILE2_SCORES) / len(FILE2_SCORES)  # 77.30
FILE1_SALARY_MEAN_REF = sum(FILE1_SALARIES) / len(FILE1_SALARIES)  # 76400.0
FILE2_SALARY_MEAN_REF = sum(FILE2_SALARIES) / len(FILE2_SALARIES)  # 83800.0
# ===========================================================================
# Tests for load_csv()
# ===========================================================================
class TestLoadCsv:
    """Structural and happy-path checks for load_csv()."""

    def test_returns_csv_bundle(self):
        """The loader hands back a CsvBundle."""
        loaded = load_csv(FILE1)
        assert isinstance(loaded, CsvBundle)

    def test_path_attribute_preserved(self):
        """The returned bundle remembers the path it was loaded from."""
        loaded = load_csv(FILE1)
        assert loaded.path == FILE1

    def test_row_count_file1(self):
        """Loading file1.csv is expected to yield exactly 5 rows."""
        frame = load_csv(FILE1).data
        assert len(frame) == 5

    def test_row_count_file2(self):
        """Loading file2.csv is expected to yield exactly 5 rows."""
        frame = load_csv(FILE2).data
        assert len(frame) == 5

    def test_column_count(self):
        """Each fixture file loads with exactly 4 columns."""
        for csv_path in (FILE1, FILE2):
            columns = load_csv(csv_path).data.columns
            assert len(columns) == 4, (
                f"{csv_path.name} should have 4 columns, got {list(columns)}"
            )

    def test_expected_columns_present(self):
        """The column set is exactly name, age, score, salary for both files."""
        required = {"name", "age", "score", "salary"}
        for csv_path in (FILE1, FILE2):
            frame = load_csv(csv_path).data
            assert set(frame.columns) == required

    def test_data_is_dataframe(self):
        """The bundle's .data attribute is a pandas DataFrame."""
        frame = load_csv(FILE1).data
        assert isinstance(frame, pd.DataFrame)

    def test_missing_file_raises(self, tmp_path):
        """Pointing load_csv() at a path that does not exist must raise."""
        missing = tmp_path / "nonexistent.csv"
        with pytest.raises(Exception):
            load_csv(missing)
# ===========================================================================
# Tests for load_and_combine()
# ===========================================================================
class TestLoadAndCombine:
    """Checks for the Combine (union) step of the pipeline."""

    def test_returns_three_tuple(self):
        """The return value unpacks into exactly three items."""
        outcome = load_and_combine(FILE1, FILE2)
        assert len(outcome) == 3

    def test_bundles_are_csv_bundles(self):
        """Items one and two of the result are CsvBundle instances."""
        first, second, _ = load_and_combine(FILE1, FILE2)
        assert isinstance(first, CsvBundle)
        assert isinstance(second, CsvBundle)

    def test_combined_is_dataframe(self):
        """Item three of the result is a pandas DataFrame."""
        merged = load_and_combine(FILE1, FILE2)[2]
        assert isinstance(merged, pd.DataFrame)

    def test_combined_row_count(self):
        """The combined frame has len(file1) + len(file2) rows.

        Independent check: each file is read directly with pd.read_csv and
        the lengths are added — load_and_combine() plays no part in the
        reference value.
        """
        first, second, merged = load_and_combine(FILE1, FILE2)
        # Reference total from direct reads of the two fixture files.
        independent_total = sum(len(pd.read_csv(src)) for src in (FILE1, FILE2))
        assert len(merged) == independent_total
        assert len(merged) == len(first.data) + len(second.data)

    def test_combined_preserves_columns(self):
        """All four original columns survive the combine step."""
        merged = load_and_combine(FILE1, FILE2)[2]
        assert set(merged.columns) == {"name", "age", "score", "salary"}

    def test_combined_index_is_reset(self):
        """The combined index runs 0, 1, 2, ... with no gaps or repeats."""
        merged = load_and_combine(FILE1, FILE2)[2]
        assert list(merged.index) == list(range(len(merged)))

    def test_bundle_paths_are_correct(self):
        """Each returned bundle carries the path of its own source file."""
        first, second, _ = load_and_combine(FILE1, FILE2)
        assert first.path == FILE1
        assert second.path == FILE2
# ===========================================================================
# Tests for analyse()
# ===========================================================================
class TestAnalyse:
    """Tests for the Analyse pipeline step — means and difference."""

    @pytest.fixture(autouse=True)
    def _bundles(self):
        """Load both fixture files before every test and keep the bundles on self."""
        self.b1, self.b2, _ = load_and_combine(FILE1, FILE2)

    # --- happy-path: score column -------------------------------------------

    def test_returns_analysis_result(self):
        """analyse() must return an AnalysisResult instance."""
        result = analyse(self.b1, self.b2, "score")
        assert isinstance(result, AnalysisResult)

    def test_column_attribute(self):
        """result.column must equal the column name passed in."""
        result = analyse(self.b1, self.b2, "score")
        assert result.column == "score"

    def test_mean_file1_score(self):
        """mean_file1 for 'score' must match the hand-computed reference.

        Reference: sum(FILE1_SCORES) / len(FILE1_SCORES) — no pandas.
        """
        result = analyse(self.b1, self.b2, "score")
        assert math.isclose(result.mean_file1, FILE1_SCORE_MEAN_REF, rel_tol=1e-9)

    def test_mean_file2_score(self):
        """mean_file2 for 'score' must match the hand-computed reference."""
        result = analyse(self.b1, self.b2, "score")
        assert math.isclose(result.mean_file2, FILE2_SCORE_MEAN_REF, rel_tol=1e-9)

    def test_difference_score(self):
        """difference must equal mean_file1 - mean_file2 (verified independently)."""
        result = analyse(self.b1, self.b2, "score")
        expected_diff = FILE1_SCORE_MEAN_REF - FILE2_SCORE_MEAN_REF
        assert math.isclose(result.difference, expected_diff, rel_tol=1e-9)
        # Also verify the internal consistency of the result's own fields.
        assert math.isclose(
            result.difference, result.mean_file1 - result.mean_file2, rel_tol=1e-9
        )

    # --- happy-path: salary column ------------------------------------------

    def test_mean_file1_salary(self):
        """mean_file1 for 'salary' must match the hand-computed reference."""
        result = analyse(self.b1, self.b2, "salary")
        assert math.isclose(result.mean_file1, FILE1_SALARY_MEAN_REF, rel_tol=1e-9)

    def test_mean_file2_salary(self):
        """mean_file2 for 'salary' must match the hand-computed reference."""
        result = analyse(self.b1, self.b2, "salary")
        assert math.isclose(result.mean_file2, FILE2_SALARY_MEAN_REF, rel_tol=1e-9)

    def test_difference_salary(self):
        """difference for 'salary' must equal mean_file1 - mean_file2."""
        result = analyse(self.b1, self.b2, "salary")
        expected_diff = FILE1_SALARY_MEAN_REF - FILE2_SALARY_MEAN_REF
        assert math.isclose(result.difference, expected_diff, rel_tol=1e-9)

    # --- error path ---------------------------------------------------------

    def test_missing_column_raises_value_error(self):
        """analyse() must raise ValueError for a column that does not exist."""
        with pytest.raises(ValueError, match="not found"):
            analyse(self.b1, self.b2, "nonexistent_column")

    def test_error_message_contains_column_name(self):
        """The ValueError message must name the missing column."""
        bad_col = "ghost_column"
        with pytest.raises(ValueError, match=bad_col):
            analyse(self.b1, self.b2, bad_col)

    def test_non_numeric_column_raises_or_returns_nan(self):
        """Requesting the 'name' (string) column should either raise or return NaN mean.

        Only the analyse() call is guarded by the try block.  The NaN check
        runs outside it so that a TypeError raised by math.isnan() on a
        non-numeric mean fails the test instead of being mistaken for the
        "analyse() raised" outcome.
        """
        try:
            result = analyse(self.b1, self.b2, "name")
        except (TypeError, ValueError):
            return  # raising is also acceptable
        # analyse() did not raise, so at least one mean must be NaN.
        assert math.isnan(result.mean_file1) or math.isnan(result.mean_file2)
# ===========================================================================
# Tests for write_report()
# ===========================================================================
class TestWriteReport:
    """Checks for the step that writes the analysis report to disk."""

    @pytest.fixture()
    def result(self):
        """Provide the 'score' analysis of the two fixture files."""
        first, second, _ = load_and_combine(FILE1, FILE2)
        return analyse(first, second, "score")

    def test_file_is_created(self, tmp_path, result):
        """Calling write_report() leaves a file at the requested path."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        assert report_path.exists()

    def test_file_contains_column_name(self, tmp_path, result):
        """The analysed column's name appears somewhere in the report text."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        report_text = report_path.read_text()
        assert "score" in report_text

    def test_file_contains_mean1(self, tmp_path, result):
        """The file-1 mean, rendered to 4 decimal places, appears in the report."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        # Reference string built from the hand-computed mean, not from the result.
        wanted = format(FILE1_SCORE_MEAN_REF, ".4f")
        assert wanted in report_path.read_text()

    def test_file_contains_mean2(self, tmp_path, result):
        """The file-2 mean, rendered to 4 decimal places, appears in the report."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        wanted = format(FILE2_SCORE_MEAN_REF, ".4f")
        assert wanted in report_path.read_text()

    def test_file_is_non_empty(self, tmp_path, result):
        """The written report has a size greater than zero bytes."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        assert report_path.stat().st_size > 0