Files
CryoLens/summarize_chunks3.py
2025-12-26 01:40:13 -05:00

157 lines
5.8 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
from collections import Counter, defaultdict
from pathlib import Path
# Input: JSON list of per-file inventory records (dataset, category,
# extension, size_bytes, filename, path).
INPUT_FILE = "glacier_files_inventory.json"
# Output: markdown summary of the inventory, written for LLM consumption.
OUTPUT_FILE = "glacier_files_inventory_rich_summary.md"
def human_size(n):
    """Render a byte count as a short human-readable string, e.g. '1.5MB'."""
    amount = float(n)
    for suffix in ("B", "KB", "MB", "GB", "TB"):
        if amount < 1024:
            return f"{amount:.1f}{suffix}"
        amount /= 1024
    # Anything that survives every division above is reported in petabytes.
    return f"{amount:.1f}PB"
def build_rich_summary(records):
    """Build a markdown report summarizing a GlacierEdge file inventory.

    Each record is a dict with optional keys ``dataset``, ``category``,
    ``extension``, ``size_bytes`` and ``path``; missing values fall back to
    "unknown" / "" / 0.  Returns the full markdown document as one string.
    """
    # --- pass 1: aggregate counts, sizes, examples, and anomalies ---------
    datasets = Counter()            # file count per dataset
    categories = Counter()          # file count per category
    extensions = Counter()          # file count per lowercased extension
    size_per_dataset = Counter()    # total bytes per dataset
    size_per_category = Counter()   # total bytes per category
    ext_per_dataset = defaultdict(Counter)
    cat_per_dataset = defaultdict(Counter)
    dataset_examples = defaultdict(list)
    anomalies = []
    for r in records:
        ds = r.get("dataset", "unknown")
        cat = r.get("category", "unknown")
        ext = (r.get("extension") or "").lower()
        size = r.get("size_bytes", 0) or 0  # treat explicit None as 0
        path = r.get("path", "")
        ds_lower = ds.lower()  # hoisted: reused by all heuristics below
        datasets[ds] += 1
        categories[cat] += 1
        extensions[ext] += 1
        size_per_dataset[ds] += size
        size_per_category[cat] += size
        ext_per_dataset[ds][ext] += 1
        cat_per_dataset[ds][cat] += 1
        # keep a few example files per dataset
        if len(dataset_examples[ds]) < 5:
            dataset_examples[ds].append(path)
        # simple anomaly heuristics
        if cat == "tax" and "video" in ds_lower:
            anomalies.append(f"Tax file in video dataset: {path}")
        if cat == "source" and ("photo" in ds_lower or "video" in ds_lower):
            anomalies.append(f"Source code in media dataset: {path}")
        if cat == "archive" and size > 5 * 1024 * 1024 * 1024:
            anomalies.append(f"Very large archive (>5GB): {path} ({human_size(size)})")

    # --- pass 2: render the markdown report -------------------------------
    lines = []
    total_files = len(records)
    total_size = sum(r.get("size_bytes", 0) or 0 for r in records)
    lines.append("# GlacierEdge Inventory Rich Semantic Summary\n")
    lines.append(f"- **Total files:** {total_files}")
    lines.append(f"- **Total size:** {human_size(total_size)}\n")
    # High-level view
    lines.append("## High-level overview\n")
    lines.append("This inventory describes your GlacierEdge archive in terms of datasets, categories, extensions, and a few heuristic anomalies. It is meant to give an LLM enough semantic context to propose better folder structures, security groupings, and cleanup strategies.\n")
    # Per-dataset narrative, most-populated dataset first
    lines.append("## Per-dataset profiles\n")
    for ds, count in datasets.most_common():
        ds_size = human_size(size_per_dataset[ds])
        top_cats = ", ".join(
            f"{c} ({n})" for c, n in cat_per_dataset[ds].most_common(5)
        ) or "none"
        top_exts = ", ".join(
            f"{e or 'no_ext'} ({n})" for e, n in ext_per_dataset[ds].most_common(5)
        ) or "none"
        lines.append(f"### Dataset: {ds}\n")
        lines.append(f"- Approximate files: **{count}**")
        lines.append(f"- Approximate size: **{ds_size}**")
        lines.append(f"- Dominant categories: {top_cats}")
        lines.append(f"- Dominant extensions: {top_exts}")
        if dataset_examples[ds]:
            lines.append("- Example paths:")
            for ex in dataset_examples[ds]:
                # two-space indent so these nest under "Example paths:" in CommonMark
                lines.append(f"  - `{ex}`")
        lines.append("")
    # Category-focused view
    lines.append("## Category insights\n")
    for cat, count in categories.most_common():
        cat_size = human_size(size_per_category[cat])
        lines.append(f"### Category: {cat}\n")
        lines.append(f"- Files: **{count}**")
        lines.append(f"- Total size: **{cat_size}**")
        # where this category lives (Counter returns 0 for absent keys
        # without inserting, so this does not mutate cat_per_dataset)
        ds_dist = [
            (ds, n) for ds, n in ((d, cat_per_dataset[d][cat]) for d in datasets)
            if n > 0
        ]
        ds_dist.sort(key=lambda x: x[1], reverse=True)
        if ds_dist:
            lines.append("- Main datasets for this category:")
            for ds, n in ds_dist[:5]:
                # two-space indent: nested bullet under the line above
                lines.append(f"  - {ds}: {n} files")
        else:
            lines.append("- No dataset distribution available.")
        lines.append("")
    # Extension-focused view (top 40 only, to bound report size)
    lines.append("## Extension landscape\n")
    lines.append("These are the most common extensions and how prevalent they are.\n")
    for ext, count in extensions.most_common(40):
        label = ext or "no_extension"
        lines.append(f"- `{label}`: {count} files")
    lines.append("")
    # Anomalies and reorg hints (capped at 200 entries)
    lines.append("## Anomalies and reorganization hints\n")
    if not anomalies:
        lines.append("- No obvious anomalies detected by simple heuristics.\n")
    else:
        lines.append("The following items may indicate misplaced files, risky storage, or opportunities for reorganization:\n")
        for a in anomalies[:200]:
            lines.append(f"- {a}")
        if len(anomalies) > 200:
            lines.append(f"- ...and {len(anomalies) - 200} more potential anomalies.\n")
    # Guidance for the LLM
    lines.append("## How to use this summary (for an LLM)\n")
    lines.append(
        "Use this summary to:\n"
        "- Propose a cleaner, more logical folder structure for GlacierEdge.\n"
        "- Suggest which datasets should hold tax, personal, or sensitive files.\n"
        "- Identify which datasets look like long-term archives vs. working sets.\n"
        "- Recommend which categories or extensions should be moved to cold storage.\n"
        "- Highlight where source code, archives, or tax files are stored in unexpected places.\n"
    )
    return "\n".join(lines)
def main():
    """Load the JSON inventory, build the markdown summary, and write it out."""
    # Explicit UTF-8 on both ends so the report (emoji header, non-ASCII
    # paths) round-trips regardless of the platform's default encoding.
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        records = json.load(f)
    summary_text = build_rich_summary(records)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.write(summary_text)
    print(f"\n✅ Rich semantic summary saved to: {OUTPUT_FILE}")
# Run the summarizer only when executed as a script, not on import.
if __name__ == "__main__":
    main()