# CryoLens/summarize_chunks2.py

import json
from collections import Counter, defaultdict

# Path of the JSON inventory that main() loads.
INPUT_FILE = "glacier_files_inventory.json"
# Path of the Markdown summary that main() writes.
OUTPUT_FILE = "glacier_files_inventory_summary.md"

def summarize(records):
    """Build a Markdown summary of a file inventory.

    Args:
        records: Iterable of dict-like records. The keys read are
            ``dataset``, ``category``, ``extension`` and ``filename``. A key
            that is missing, null or empty is treated as absent: dataset and
            category fall back to ``"unknown"``, extension and filename to
            the empty string.

    Returns:
        The Markdown report as one string: the total record count,
        per-dataset and per-category counts, the 20 most common extensions
        (lower-cased), and a count for each notable pattern detected.
    """
    datasets = Counter()
    categories = Counter()
    extensions = Counter()
    # Only per-label counts are reported, so count matches instead of
    # accumulating every matching filename in memory.
    notable = Counter()
    # Counted in the loop so any iterable works, not just sized containers.
    total = 0

    # Category value -> label shown under "Notable Patterns".
    notable_categories = {
        "tax": "Tax files",
        "archive": "Archives",
        "source": "Source code",
    }

    for r in records:
        total += 1

        # `or` also covers explicit JSON nulls, which a .get() default does
        # not; str() keeps .lower() safe for non-string values.
        dataset = r.get("dataset") or "unknown"
        category = r.get("category") or "unknown"
        ext = str(r.get("extension") or "").lower()
        fname = str(r.get("filename") or "").lower()

        datasets[dataset] += 1
        categories[category] += 1
        extensions[ext] += 1

        # Detect notable patterns (filename prefix first, then category, so
        # labels appear in the order they were first encountered).
        if fname.startswith("dji"):
            notable["DJI media"] += 1
        label = notable_categories.get(category)
        if label is not None:
            notable[label] += 1

    summary = [
        "# GlacierEdge Inventory Summary\n",
        f"Total files indexed: **{total}**\n",
    ]

    summary.append("## Datasets")
    summary.extend(
        f"- **{ds}**: {count} files" for ds, count in datasets.most_common()
    )
    summary.append("")

    summary.append("## Categories")
    summary.extend(
        f"- **{cat}**: {count}" for cat, count in categories.most_common()
    )
    summary.append("")

    summary.append("## Top Extensions")
    for ext, count in extensions.most_common(20):
        # An empty code span is not valid Markdown; label it explicitly.
        shown = f"`{ext}`" if ext else "(no extension)"
        summary.append(f"- {shown}: {count}")
    summary.append("")

    summary.append("## Notable Patterns")
    if not notable:
        summary.append("- None detected")
    else:
        summary.extend(
            f"- **{label}**: {count} files" for label, count in notable.items()
        )
    summary.append("")

    return "\n".join(summary)

def main():
    """Load the inventory, summarize it, and write the Markdown report.

    Reads the JSON document at ``INPUT_FILE``, passes the parsed value to
    ``summarize``, writes the resulting text to ``OUTPUT_FILE`` (overwriting
    any existing file), and prints a confirmation line. Errors from opening
    the files or parsing the JSON are not caught and propagate to the caller.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        records = json.load(f)

    summary_text = summarize(records)

    # Write UTF-8 explicitly so non-ASCII dataset/category names survive on
    # platforms whose default encoding cannot represent them.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.write(summary_text)

    print(f"\n✅ Summary saved to: {OUTPUT_FILE}")

# Run the summary only when executed as a script, not when imported.
if __name__ == "__main__":
    main()