"""
Unit tests for src/csv_grok.py
================================

Every numeric assertion is verified by an independent second method so the
test itself is trustworthy and not just a tautology.

Independent verification strategy
-----------------------------------
- Means are cross-checked via sum(values) / len(values) computed directly
  from the raw fixture lists — no pandas involved in the reference calculation.
- Combined row count is verified by simple integer addition.
- File content checks use plain string search, not the module's own
  formatting helpers.
"""

from __future__ import annotations

import math
import sys
from pathlib import Path

import pandas as pd
import pytest

# Make src/ importable without installing the package.  This must run before
# the csv_grok import below, which is why that import is not at the very top.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from csv_grok import (
    AnalysisResult,
    CsvBundle,
    analyse,
    load_and_combine,
    load_csv,
    write_report,
)

# ---------------------------------------------------------------------------
# Paths to the shared fixture CSVs
# ---------------------------------------------------------------------------
DATA_DIR = Path(__file__).parent.parent / "data"
FILE1 = DATA_DIR / "file1.csv"
FILE2 = DATA_DIR / "file2.csv"

# ---------------------------------------------------------------------------
# Ground-truth values derived by hand from the fixture files
# (independent of pandas – used as the reference in assertions)
# ---------------------------------------------------------------------------

# file1.csv score column: Alice=88.5, Bob=91.0, Carol=76.3, Dave=83.7, Eve=95.2
FILE1_SCORES = [88.5, 91.0, 76.3, 83.7, 95.2]
FILE1_SCORE_MEAN_REF = sum(FILE1_SCORES) / len(FILE1_SCORES)  # 86.94

# file2.csv score column: Frank=70.1, Grace=88.9, Hank=65.4, Iris=79.8, Jack=82.3
FILE2_SCORES = [70.1, 88.9, 65.4, 79.8, 82.3]
FILE2_SCORE_MEAN_REF = sum(FILE2_SCORES) / len(FILE2_SCORES)  # 77.30

# file1.csv salary column
FILE1_SALARIES = [72000, 65000, 85000, 70000, 90000]
FILE1_SALARY_MEAN_REF = sum(FILE1_SALARIES) / len(FILE1_SALARIES)  # 76400.0

# file2.csv salary column
FILE2_SALARIES = [95000, 58000, 110000, 68000, 88000]
FILE2_SALARY_MEAN_REF = sum(FILE2_SALARIES) / len(FILE2_SALARIES)  # 83800.0


# ===========================================================================
# Tests for load_csv()
# ===========================================================================

class TestLoadCsv:
    """Happy-path and structural checks for load_csv()."""

    def test_returns_csv_bundle(self):
        """load_csv() must return a CsvBundle instance."""
        bundle = load_csv(FILE1)
        assert isinstance(bundle, CsvBundle)

    def test_path_attribute_preserved(self):
        """The bundle's .path must equal the path that was passed in."""
        bundle = load_csv(FILE1)
        assert bundle.path == FILE1

    def test_row_count_file1(self):
        """file1.csv has 5 data rows."""
        bundle = load_csv(FILE1)
        assert len(bundle.data) == 5

    def test_row_count_file2(self):
        """file2.csv has 5 data rows."""
        bundle = load_csv(FILE2)
        assert len(bundle.data) == 5

    def test_column_count(self):
        """Both files have exactly 4 columns."""
        for path in (FILE1, FILE2):
            bundle = load_csv(path)
            assert len(bundle.data.columns) == 4, (
                f"{path.name} should have 4 columns, got {list(bundle.data.columns)}"
            )

    def test_expected_columns_present(self):
        """Columns name, age, score, salary must all be present."""
        expected = {"name", "age", "score", "salary"}
        for path in (FILE1, FILE2):
            bundle = load_csv(path)
            assert expected == set(bundle.data.columns)

    def test_data_is_dataframe(self):
        """bundle.data must be a pandas DataFrame."""
        bundle = load_csv(FILE1)
        assert isinstance(bundle.data, pd.DataFrame)

    def test_missing_file_raises(self, tmp_path):
        """load_csv() must raise when the file does not exist."""
        # NOTE(review): Exception is deliberately left broad here because the
        # exact error type load_csv() raises is not pinned by this file —
        # narrow it (e.g. to FileNotFoundError) once the contract is confirmed.
        with pytest.raises(Exception):
            load_csv(tmp_path / "nonexistent.csv")


# ===========================================================================
# Tests for load_and_combine()
# ===========================================================================

class TestLoadAndCombine:
    """Tests for the Combine (union) pipeline step."""

    def test_returns_three_tuple(self):
        """load_and_combine() must return a 3-tuple."""
        result = load_and_combine(FILE1, FILE2)
        assert len(result) == 3

    def test_bundles_are_csv_bundles(self):
        """First two elements of the tuple must be CsvBundle instances."""
        b1, b2, _ = load_and_combine(FILE1, FILE2)
        assert isinstance(b1, CsvBundle)
        assert isinstance(b2, CsvBundle)

    def test_combined_is_dataframe(self):
        """Third element must be a pandas DataFrame."""
        _, _, combined = load_and_combine(FILE1, FILE2)
        assert isinstance(combined, pd.DataFrame)

    def test_combined_row_count(self):
        """Combined DataFrame must have len(file1) + len(file2) rows.

        Independent check: load each file separately with pd.read_csv and
        add their lengths — no call to load_and_combine() in the reference.
        """
        b1, b2, combined = load_and_combine(FILE1, FILE2)

        # Reference: independent row counts via direct pd.read_csv
        ref_rows = len(pd.read_csv(FILE1)) + len(pd.read_csv(FILE2))

        assert len(combined) == ref_rows
        assert len(combined) == len(b1.data) + len(b2.data)

    def test_combined_preserves_columns(self):
        """Combined DataFrame must retain all 4 original columns."""
        _, _, combined = load_and_combine(FILE1, FILE2)
        assert set(combined.columns) == {"name", "age", "score", "salary"}

    def test_combined_index_is_reset(self):
        """Combined DataFrame index must be 0-based and contiguous."""
        _, _, combined = load_and_combine(FILE1, FILE2)
        expected_index = list(range(len(combined)))
        assert list(combined.index) == expected_index

    def test_bundle_paths_are_correct(self):
        """Each bundle must carry the path it was loaded from."""
        b1, b2, _ = load_and_combine(FILE1, FILE2)
        assert b1.path == FILE1
        assert b2.path == FILE2


# ===========================================================================
# Tests for analyse()
# ===========================================================================

class TestAnalyse:
    """Tests for the Analyse pipeline step — means and difference."""

    @pytest.fixture(autouse=True)
    def _bundles(self):
        """Load both fixture files before every test and keep the two bundles."""
        self.b1, self.b2, _ = load_and_combine(FILE1, FILE2)

    # --- happy-path: score column -------------------------------------------

    def test_returns_analysis_result(self):
        """analyse() must return an AnalysisResult instance."""
        result = analyse(self.b1, self.b2, "score")
        assert isinstance(result, AnalysisResult)

    def test_column_attribute(self):
        """result.column must equal the column name passed in."""
        result = analyse(self.b1, self.b2, "score")
        assert result.column == "score"

    def test_mean_file1_score(self):
        """mean_file1 for 'score' must match the hand-computed reference.

        Reference: sum(FILE1_SCORES) / len(FILE1_SCORES) — no pandas.
        """
        result = analyse(self.b1, self.b2, "score")
        assert math.isclose(result.mean_file1, FILE1_SCORE_MEAN_REF, rel_tol=1e-9)

    def test_mean_file2_score(self):
        """mean_file2 for 'score' must match the hand-computed reference."""
        result = analyse(self.b1, self.b2, "score")
        assert math.isclose(result.mean_file2, FILE2_SCORE_MEAN_REF, rel_tol=1e-9)

    def test_difference_score(self):
        """difference must equal mean_file1 − mean_file2 (verified independently)."""
        result = analyse(self.b1, self.b2, "score")
        expected_diff = FILE1_SCORE_MEAN_REF - FILE2_SCORE_MEAN_REF
        assert math.isclose(result.difference, expected_diff, rel_tol=1e-9)
        # Also verify the internal consistency of the dataclass fields
        assert math.isclose(
            result.difference,
            result.mean_file1 - result.mean_file2,
            rel_tol=1e-9,
        )

    # --- happy-path: salary column ------------------------------------------

    def test_mean_file1_salary(self):
        """mean_file1 for 'salary' must match the hand-computed reference."""
        result = analyse(self.b1, self.b2, "salary")
        assert math.isclose(result.mean_file1, FILE1_SALARY_MEAN_REF, rel_tol=1e-9)

    def test_mean_file2_salary(self):
        """mean_file2 for 'salary' must match the hand-computed reference."""
        result = analyse(self.b1, self.b2, "salary")
        assert math.isclose(result.mean_file2, FILE2_SALARY_MEAN_REF, rel_tol=1e-9)

    def test_difference_salary(self):
        """difference for 'salary' must equal mean_file1 − mean_file2."""
        result = analyse(self.b1, self.b2, "salary")
        expected_diff = FILE1_SALARY_MEAN_REF - FILE2_SALARY_MEAN_REF
        assert math.isclose(result.difference, expected_diff, rel_tol=1e-9)

    # --- error path ---------------------------------------------------------

    def test_missing_column_raises_value_error(self):
        """analyse() must raise ValueError for a column that does not exist."""
        with pytest.raises(ValueError, match="not found"):
            analyse(self.b1, self.b2, "nonexistent_column")

    def test_error_message_contains_column_name(self):
        """The ValueError message must name the missing column."""
        bad_col = "ghost_column"
        with pytest.raises(ValueError, match=bad_col):
            analyse(self.b1, self.b2, bad_col)

    def test_non_numeric_column_raises_or_returns_nan(self):
        """Requesting the 'name' (string) column should either raise or return NaN mean."""
        # Guard only the call under test.  Keeping the assertion outside the
        # try block means a TypeError raised by math.isnan() itself (e.g. the
        # result holding a non-float mean) fails the test instead of being
        # mistaken for analyse() rejecting the column.
        try:
            result = analyse(self.b1, self.b2, "name")
        except (TypeError, ValueError):
            return  # raising is also acceptable

        # If it doesn't raise, the means must be NaN (pandas behaviour for strings)
        assert math.isnan(result.mean_file1) or math.isnan(result.mean_file2)


# ===========================================================================
# Tests for write_report()
# ===========================================================================

class TestWriteReport:
    """Tests for the report-writing step."""

    @pytest.fixture()
    def result(self):
        """Provide the analyse() result for the 'score' column of both fixtures."""
        b1, b2, _ = load_and_combine(FILE1, FILE2)
        return analyse(b1, b2, "score")

    def test_file_is_created(self, tmp_path, result):
        """write_report() must create the output file."""
        out = tmp_path / "report.txt"
        write_report(result, out)
        assert out.exists()

    def test_file_contains_column_name(self, tmp_path, result):
        """The report must mention the column that was analysed."""
        out = tmp_path / "report.txt"
        write_report(result, out)
        assert "score" in out.read_text()

    def test_file_contains_mean1(self, tmp_path, result):
        """The report must contain the mean of file 1 (to 4 decimal places)."""
        out = tmp_path / "report.txt"
        write_report(result, out)
        # Independent reference: format the hand-computed value the same way
        expected_str = f"{FILE1_SCORE_MEAN_REF:.4f}"
        assert expected_str in out.read_text()

    def test_file_contains_mean2(self, tmp_path, result):
        """The report must contain the mean of file 2 (to 4 decimal places)."""
        out = tmp_path / "report.txt"
        write_report(result, out)
        expected_str = f"{FILE2_SCORE_MEAN_REF:.4f}"
        assert expected_str in out.read_text()

    def test_file_is_non_empty(self, tmp_path, result):
        """The report file must not be empty."""
        out = tmp_path / "report.txt"
        write_report(result, out)
        assert out.stat().st_size > 0