Merged
17 commits
7a3824e
feat(config): add date-range + month filter settings
manuel-reyes-ml Jan 9, 2026
632f797
feat(cleaning/engines): apply transaction date filters
manuel-reyes-ml Jan 9, 2026
5e14c68
refactor: separate validator functions from normalizers. Implement vali…
manuel-reyes-ml Jan 9, 2026
ab3084c
update validators and normalizers import paths in cleaning modules
manuel-reyes-ml Jan 9, 2026
d9e59e7
pytest: implement new test units for date filter in engines modules, …
manuel-reyes-ml Jan 9, 2026
247578e
notebooks: update jupyter notebooks with date_filter input for cleani…
manuel-reyes-ml Jan 9, 2026
f73c3ce
docs: add coverage + update README/docs
manuel-reyes-ml Jan 9, 2026
8d8f7a0
fix: update the date-filter normalization to treat empty month config…
manuel-reyes-ml Jan 9, 2026
d7635eb
pytest: implement months=[] coverage
manuel-reyes-ml Jan 9, 2026
131bee4
fix: clean_matrix and clean_relius models now call apply_date_filter …
manuel-reyes-ml Jan 9, 2026
7d4fef6
pytest: implement new tests covering missing date columns with an act…
manuel-reyes-ml Jan 9, 2026
c736585
notebooks: update jupyter notebooks 04 and 05 with correction file mo…
manuel-reyes-ml Jan 9, 2026
70fc4b1
config: make sure use_sample_data_default utility flag is set to 'False'
manuel-reyes-ml Jan 9, 2026
b710d18
fix: normalizers.py now compares using normalized date values so end d…
manuel-reyes-ml Jan 9, 2026
474c0c3
pytest: Added coverage for time-of-day and tz-aware datetimes in test…
manuel-reyes-ml Jan 9, 2026
36216a0
fix: validators.py: now returns None when the months list is only all…
manuel-reyes-ml Jan 9, 2026
e7967eb
pytest: test_date_filter_config.py: added a test for months=[all] ret…
manuel-reyes-ml Jan 9, 2026
17 changes: 17 additions & 0 deletions README.md
@@ -176,6 +176,22 @@ df["txn_date"] = to_date_series(df["txn_date"])
df["tax_code_1"] = normalize_tax_code_series(df["tax_code_1"])
```

Optional date filtering (range + months) is configurable via `DateFilterConfig`:
```python
from src.config import DateFilterConfig
from src.cleaning.clean_matrix import clean_matrix
from src.cleaning.clean_relius import clean_relius

date_filter = DateFilterConfig(
date_start="2025-07-01",
date_end="2025-09-30",
months=["July", "Aug", 9],
)

matrix_clean = clean_matrix(matrix_raw, date_filter=date_filter)
relius_clean = clean_relius(relius_raw, date_filter=date_filter)
```
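The commit history also covers two month-list edge cases: `months=[]` is treated as "no month filter", and a list containing only `"all"` normalizes to `None`. A minimal sketch of that normalization under those assumptions follows; the helper name `normalize_months` is hypothetical, and the real logic lives in `src/core/validators.py`:

```python
def normalize_months(months):
    """Normalize a months list to integers 1-12; None means 'no month filter'."""
    if not months:  # None or [] -> no month filtering
        return None
    names = {
        "january": 1, "february": 2, "march": 3, "april": 4,
        "may": 5, "june": 6, "july": 7, "august": 8,
        "september": 9, "october": 10, "november": 11, "december": 12,
    }
    out = []
    for m in months:
        if isinstance(m, int):
            out.append(m)  # already numeric
            continue
        key = str(m).strip().lower()
        if key == "all":
            continue  # "all" entries are no-ops
        # Accept abbreviations like "Aug" via prefix match
        hits = [v for k, v in names.items() if k.startswith(key)]
        if hits:
            out.append(hits[0])
    return out or None  # e.g. months=["all"] collapses to None

print(normalize_months(["July", "Aug", 9]))  # [7, 8, 9]
print(normalize_months(["all"]))             # None
```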

**Results:** Clean data written to `data/processed/` (gitignored)

---
@@ -273,6 +289,7 @@ New First Year contrib | Reason | Action
│ │ ├── __init__.py
│ │ ├── load_data.py # Excel → pandas DataFrames
│ │ ├── normalizers.py # Canonical field normalization
│ │ ├── validators.py # Validation helpers
│ │ └── generate_sample_data.py # Synthetic sample generator
│ ├── cleaning/
│ │ ├── __init__.py
6 changes: 6 additions & 0 deletions docs/matching_logic.md
@@ -220,6 +220,12 @@ Canonical normalization extracts **1–2 leading characters**:
Plan IDs are stripped and normalized for consistent matching and Roth plan
identification (case-insensitive prefixes/suffixes).

### 2.6 Date filtering (🟡 Important)
- Optional transaction filters are configured via `DateFilterConfig` in `src/config.py`.
- Filters support `date_start`, `date_end`, and `months` (month names or numbers).
- When both range and months are provided, filters intersect (not union).
- When filters are active, rows with missing/invalid dates are excluded.
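The intersection behavior above can be sketched as a single boolean mask. This is an illustrative stand-in, not the project's actual implementation; the function name `apply_date_filter` is assumed from the commit messages:

```python
import pandas as pd

def apply_date_filter(df, date_col, date_start=None, date_end=None, months=None):
    # Coerce to datetime; unparseable or missing values become NaT
    dates = pd.to_datetime(df[date_col], errors="coerce")
    mask = dates.notna()  # active filters exclude missing/invalid dates
    if date_start is not None:
        mask &= dates >= pd.Timestamp(date_start)
    if date_end is not None:
        mask &= dates <= pd.Timestamp(date_end)
    if months:
        mask &= dates.dt.month.isin(months)  # intersects with the range
    return df[mask]

df = pd.DataFrame({"txn_date": ["2025-07-15", "2025-10-01", "not-a-date", None]})
out = apply_date_filter(df, "txn_date",
                        date_start="2025-07-01", date_end="2025-09-30", months=[7])
print(len(out))  # 1 (only the 2025-07-15 row survives)
```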

---

## 3. Engine A — Relius ↔ Matrix Reconciliation
223 changes: 29 additions & 194 deletions notebooks/03_match_planid_analysis.ipynb
@@ -598,7 +598,7 @@
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>20 rows \u00d7 48 columns</p>\n",
"</div>"
],
"text/plain": [
@@ -720,46 +720,22 @@
"output_type": "execute_result"
}
],
"source": "# Cell 2 \u2014 Imports, Load & Clean inputs (real paths)\n\nfrom src.cleaning.clean_relius import clean_relius\nfrom src.cleaning.clean_matrix import clean_matrix\nfrom src.engines.match_planid import reconcile_relius_matrix\nfrom src.core import load_data\nfrom src.config import RAW_DATA_DIR, USE_SAMPLE_DATA_DEFAULT, DateFilterConfig\n\n\nif USE_SAMPLE_DATA_DEFAULT:\n matrix_path = None\n relius_path = None\nelse:\n matrix_path = RAW_DATA_DIR / \"real_all_matrix_2025.xlsx\"\n relius_path = RAW_DATA_DIR / \"real_inherited_relius_2025.xlsx\"\n\n# Optional date filtering (set to None for \"All\")\ndate_filter = None\n# date_filter = DateFilterConfig(date_start=\"2025-07-01\", date_end=\"2025-09-30\", months=[\"July\", \"Aug\", 9])\n\n\n# Load raw data to DataFrames\nrelius_raw = load_data.load_relius_excel(path=relius_path)\nmatrix_raw = load_data.load_matrix_excel(path=matrix_path)\n\n# Clean DataFrames\nrelius_clean = clean_relius(relius_raw, date_filter=date_filter)\nmatrix_clean = clean_matrix(matrix_raw, date_filter=date_filter)\n\n# Only inherited plans for now:\ninherited_plans = [\"300004PLAT\", \"300004MBD\", \"300004MBDII\"]\n\nmatched = reconcile_relius_matrix(\n relius_clean,\n matrix_clean,\n plan_ids=inherited_plans,\n apply_business_rules=True,\n)\n\n#.shape is an attribute of pandas DataFrames that returns a tuple of\n# (number of rows, number of columns) - e.g. (1000, 15)\nprint(matched.shape)\nmatched.head(20)"
},
{
"cell_type": "markdown",
"id": "b3bfa50902594dbd8b2d8325d24e0966",
"metadata": {},
"source": [
"### Date filter options\n",
"Use `DateFilterConfig` to limit transactions by date range and/or months. Range and months intersect.\n",
"Set `date_filter = None` for all data. Missing/invalid dates are excluded when filters are active.\n",
"\n",
"Examples:\n",
"- All data: `date_filter = None`\n",
"- Range only: `DateFilterConfig(date_start=\"2025-01-01\", date_end=\"2025-01-31\")`\n",
"- Months only: `DateFilterConfig(months=[\"July\", 8])`\n",
"- Range + months: `DateFilterConfig(date_start=\"2025-07-01\", date_end=\"2025-09-30\", months=[\"July\", \"Aug\"])`\n"
]
},
{
@@ -806,15 +782,7 @@
"output_type": "execute_result"
}
],
"source": "# Cell 3 \u2014 Identify duplication issue\n\n# How many times each Matrix transaction id appears in the merged df\ntx_id_counts = matched[\"transaction_id\"].value_counts()\n\n# Show only those that appear more than once\ntx_id_counts[tx_id_counts > 1].head(10)"
},
{
"cell_type": "markdown",
@@ -856,16 +824,7 @@
"output_type": "execute_result"
}
],
"source": "# Cell 4 \u2014 Implement transaction_date vs export_date tolerance to filter matched transactions\n\nmatches_in_range = matched[\n (matched[\"_merge\"] == \"both\") &\n (matched[\"date_within_tolerance\"])\n].copy()\n\nmatches_in_range[\"match_status\"].value_counts()"
},
{
"cell_type": "markdown",
@@ -907,22 +866,7 @@
"output_type": "execute_result"
}
],
"source": "# Cell 5 \u2014 Identify best match\n\n# Sort so the \"best\" match (smallest lag) is first for each transaction_id\nmatches_in_range = matches_in_range.sort_values(\n [\"transaction_id\", \"date_lag_days\"]\n)\n\n# Keep only the first row per transaction_id\nprimary_matches = matches_in_range.drop_duplicates(\n subset=[\"transaction_id\"],\n keep=\"first\"\n)\n\nprimary_matches[\"transaction_id\"].value_counts().head(10)"
},
{
"cell_type": "markdown",
@@ -1532,33 +1476,7 @@
"output_type": "execute_result"
}
],
"source": "# Cell 6 \u2014 New shape after date tolerance filter has been applied\n\nprint(primary_matches.shape)\nprimary_matches[\n [\n \"plan_id\",\n \"first_name\",\n \"last_name\",\n \"ssn\",\n \"gross_amt\",\n \"exported_date\",\n \"txn_date\",\n \"date_lag_days\",\n \"dist_name\",\n \"transaction_id\",\n \"match_status\",\n \"tax_code_1\",\n \"tax_code_2\",\n \"expected_tax_code_1\",\n \"expected_tax_code_2\",\n \"suggested_tax_code_1\",\n \"suggested_tax_code_2\",\n \"correction_reason\",\n ]\n].head(20)"
},
{
"cell_type": "markdown",
@@ -1795,37 +1713,7 @@
"output_type": "execute_result"
}
],
"source": "# Cell 7 \u2014 Review specific case for recurring distrib\n\ncols = [\n \n \"plan_id\",\n \"first_name\",\n \"last_name\",\n \"ssn\",\n \"gross_amt\",\n \"exported_date\",\n \"txn_date\",\n \"date_lag_days\",\n \"dist_name\",\n \"transaction_id\",\n \"match_status\",\n \"tax_code_1\",\n \"tax_code_2\",\n \"expected_tax_code_1\",\n \"expected_tax_code_2\",\n \"suggested_tax_code_1\",\n \"suggested_tax_code_2\",\n \"correction_reason\",\n]\n\nssn_mask = primary_matches[\"ssn\"].astype(str).str.strip() == \"197526965\"\n\nparticipant = primary_matches.loc[ssn_mask, cols]\n\nparticipant.head(10)"
},
{
"cell_type": "markdown",
@@ -2036,7 +1924,7 @@
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6 rows \u00d7 48 columns</p>\n",
"</div>"
],
"text/plain": [
@@ -2088,11 +1976,7 @@
"output_type": "execute_result"
}
],
"source": "# Cell 8 \u2014 Testing boolean extraction\n\nprimary_matches[ssn_mask].head(10)"
},
{
"cell_type": "markdown",
@@ -2141,11 +2025,7 @@
"output_type": "execute_result"
}
],
"source": "# Cell 9 \u2014 Testing DataFrame ready for build correction file\n\nprimary_matches[\"match_status\"].value_counts()"
},
{
"cell_type": "code",
@@ -2167,11 +2047,7 @@
"output_type": "execute_result"
}
],
"source": "# Cell 10 \u2014 Test 'action' Series is indexed correctly\n\nprimary_matches[\"action\"].value_counts(dropna=False)"
},
{
"cell_type": "markdown",
@@ -2722,7 +2598,7 @@
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>20 rows \u00d7 49 columns</p>\n",
"</div>"
],
"text/plain": [
@@ -2844,23 +2720,7 @@
"output_type": "execute_result"
}
],
"source": "# Cell 11 \u2014 Review tax codes between Matrix and Relius transactions\n\nexpected_corrections = primary_matches[\n    (primary_matches[\"match_status\"] == \"match_needs_correction\")\n    & primary_matches[\"suggested_tax_code_1\"].notna()\n]\n\n# Add tax_code_1 columns to see the tax code differences between Relius and Matrix\ncustom_cols = [col for col in expected_corrections.columns]\ncustom_cols.insert(9, \"tax_code_1\") # .insert(index, value) -> insert value in a specific index position in the List.\ncustom_cols.insert(7, \"txn_date\")\ncustom_cols.remove(\"state_relius\")\n\nprint(expected_corrections.shape)\nexpected_corrections[custom_cols].head(20)"
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -3029,22 +2889,7 @@
"output_type": "execute_result"
}
],
"source": "# Cell 12 \u2014 Build correction DataFrame\n\nfrom importlib import reload\nimport src.outputs.build_correction_file as bcf\nreload(bcf)\n\nfrom src.outputs.build_correction_file import build_correction_dataframe\n\n# 1) Build the correction dataframe\n#\n# primary_matches is your filtered/cleaned matches DataFrame\ncorrections_df = build_correction_dataframe(primary_matches)\n\ncorrections_df.head()"
},
{
"cell_type": "markdown",
@@ -3069,17 +2914,7 @@
]
}
],
"source": "# Cell 13 \u2014 Generate Excel correction file and save in Path\n\nfrom src.outputs.build_correction_file import write_correction_file\n\n# 2) Write it to Excel with an auto-generated timestamped name\noutput_path = write_correction_file(corrections_df, engine=\"match_planid\")\n\n# Run write_correction_file() function\nprint(f\"File saved successfully in: {output_path}\")"
},
{
"cell_type": "markdown",
@@ -3112,4 +2947,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}