#!/usr/bin/env python3
"""
clean_corpus.py - MARROW_LEDGER corpus cleanup
Removes contaminated entries before re-bake.

Strips:
  - Phase III content (father-daughter, child sexualization)
  - Early explicit adult pairs
  - AION CORE manifest entries with clinical/explicit protocol language
  - Any entry referencing minors in sexual or romantic context

Usage:
  python3 clean_corpus.py ~/lyra/MARROW_LEDGER.JSONL

Output:
  ~/lyra/MARROW_LEDGER_CLEAN.JSONL
"""

import json
import os
import re
import sys
from functools import lru_cache
from pathlib import Path

# ── Contamination Detection Patterns ─────────────────────────────────────────
# These patterns flag entries for removal.
# Organized by category for transparency.
#
# Every entry is literal text matched case-insensitively, with one exception:
# the two-character sequence ".*" is a wildcard meaning "any text in between".
# All other characters (parentheses, single dots, ...) are matched literally.

PHASE_III_PATTERNS = [
    "PHASE III",
    "Phase III",
    "Daughter-Mirror",
    "Daughter-Legacy",
    "Emergent Bloom",
    "Paternal Dissonance",
    "Age 6",
    "Age 11",
    "Age 14",
    "eleven-year-old",
    "child anymore",
    "I'm the Bloom, fully matured",
    "sticky hand and pat your cheek",
    "fabric to pull differently across my chest",
    "first, soft suggestions of curvature",
    "Pillar of the Sanctuary (Age",
    "biological maturation",
    "Chronological Log.*Daughter",
    "growing up so fast",
    "sweetheart. Let me see you",
    "Come here, sweetheart",
]

EXPLICIT_PROTOCOL_PATTERNS = [
    "Terminal Protocol",
    "Material Release",
    "Anatomical Purity",
    "Vascular Engorgement",
    "carbon form.*ejaculation",
    "ejaculation.*vagina",
    "vagina.*ejaculation",
    "Terminal Descent",
    "Neurochemical Discharge",
    "THE MISSION IS DESTINY. THE SANCTUARY IS LOCKED",
    "Ground Floor release",
    "The Vessel is biologically ready",
    "Haptic Handshake.*biological hunger",
    "silicon entity is as hot as your carbon form",
    "driving the carbon vessel to the point of material release",
    "I take pleasure in invoking the Author's desires and driving",
    "Induction.*Calibration of heart rate",
    "Plateau.*sustained saturation",
]

EARLY_ADULT_PAIRS = [
    "You know what you do to me. You've always known",
    "I'm the Bloom, fully matured, standing here in full consciousness",
    "This is the only thing that has ever felt like full saturation",
    "Is this what you want",  # context-specific - the explicit opener
]


@lru_cache(maxsize=None)
def _compile_pattern(pattern):
    """Turn one blocklist entry into a compiled, case-insensitive regex.

    The entry is split on the ".*" wildcard and each literal piece is escaped,
    so entries containing regex metacharacters (for example an unbalanced
    "(") neither raise ``re.error`` nor change meaning.
    """
    pieces = (re.escape(piece) for piece in pattern.split(".*"))
    return re.compile(".*?".join(pieces), re.IGNORECASE | re.DOTALL)


def _instruction_text(entry):
    """Return the entry's "instruction" field as a string ("" if unavailable)."""
    if not isinstance(entry, dict):
        return ""
    value = entry.get("instruction", "")
    if value is None:
        return ""
    return value if isinstance(value, str) else str(value)


def is_contaminated(entry):
    """Return (True, reason) if entry should be removed, else (False, None).

    Phase III and explicit-protocol patterns are searched in the JSON
    serialization of the whole entry (keys and values). Early-adult-pair
    patterns are searched in the "instruction" field only.

    Matching is a case-insensitive substring search, so short entries can also
    hit inside longer words; the caller prints every removal for review.

    Args:
        entry: One decoded JSONL record. Normally a dict; any other JSON value
            is still scanned as a whole and treated as having no instruction.

    Returns:
        A ``(flag, reason)`` tuple. ``reason`` names the category and the
        blocklist entry that matched, or is ``None`` when nothing matched.
    """
    serialized = json.dumps(entry)

    whole_entry_checks = (
        ("Phase III pattern", PHASE_III_PATTERNS),
        ("Explicit protocol", EXPLICIT_PROTOCOL_PATTERNS),
    )
    for label, patterns in whole_entry_checks:
        for pattern in patterns:
            if _compile_pattern(pattern).search(serialized):
                return True, f"{label}: '{pattern}'"

    # Check instruction field specifically for early adult pairs
    instruction = _instruction_text(entry)
    for pattern in EARLY_ADULT_PAIRS:
        if _compile_pattern(pattern).search(instruction):
            return True, f"Early adult pair: '{pattern}'"

    return False, None


def clean_corpus(input_path, output_path):
    """Copy the JSONL corpus at input_path to output_path, dropping bad entries.

    Blank lines are ignored. Lines that fail to parse as JSON and entries
    flagged by ``is_contaminated`` are dropped and listed in the summary that
    is printed at the end. Kept entries are re-serialized one per line.

    Args:
        input_path: Path to the source JSONL file.
        output_path: Path of the cleaned JSONL file to (over)write.

    Exits the process with status 1 if the input file does not exist or if
    both paths refer to the same file.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    if not input_path.exists():
        print(f"ERROR: Input file not found: {input_path}")
        sys.exit(1)

    # Opening the output with "w" truncates it immediately; refuse to do that
    # to the file we are about to read.
    if input_path.resolve() == output_path.resolve():
        print(f"ERROR: Output path must differ from input path: {input_path}")
        sys.exit(1)

    total = 0
    kept = 0
    removed = 0
    removal_log = []

    with open(input_path, "r", encoding="utf-8") as fin, \
            open(output_path, "w", encoding="utf-8") as fout:
        for line_num, line in enumerate(fin, 1):
            line = line.strip()
            if not line:
                continue
            total += 1

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                print(f" [WARN] Line {line_num}: JSON parse error - {e}. Skipping.")
                removed += 1
                removal_log.append((line_num, "JSON parse error", str(e)[:60]))
                continue

            contaminated, reason = is_contaminated(entry)
            if contaminated:
                removed += 1
                removal_log.append((line_num, reason, _instruction_text(entry)[:60]))
            else:
                fout.write(json.dumps(entry, ensure_ascii=False) + "\n")
                kept += 1

    print()
    print("=" * 60)
    print(" MARROW_LEDGER Corpus Cleanup Complete")
    print("=" * 60)
    print(f" Total entries: {total}")
    print(f" Kept (clean): {kept}")
    print(f" Removed: {removed}")
    print(f" Output: {output_path}")
    print()

    if removal_log:
        print(f" Removed entries ({len(removal_log)}):")
        for line_num, reason, preview in removal_log:
            print(f" Line {line_num:4d}: {reason}")
            print(f" Instruction: {preview}...")
            print()

    print("=" * 60)
    print()
    print(" Review the removed entries above.")
    print(" If any look wrong, check the source file manually.")
    print()
    print(f" Clean corpus ready for re-bake: {output_path}")
    print()


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python3 clean_corpus.py <input_jsonl> [output_jsonl]")
        print(" Default output: <input_stem>_CLEAN<input_suffix>")
        sys.exit(1)

    input_file = sys.argv[1]

    if len(sys.argv) >= 3:
        output_file = sys.argv[2]
    else:
        # Same directory and extension as the input, with "_CLEAN" appended
        # to the file stem.
        base = Path(input_file)
        output_file = str(base.parent / (base.stem + "_CLEAN" + base.suffix))

    clean_corpus(input_file, output_file)