Files
CryoLens/summarize_chunks2.py
2025-12-26 01:40:13 -05:00

75 lines
2.1 KiB
Python
Executable File

import json
from collections import Counter, defaultdict
# Path of the JSON inventory that main() reads (relative paths resolve
# against the current working directory).
INPUT_FILE = "glacier_files_inventory.json"
# Path of the Markdown summary that main() writes.
OUTPUT_FILE = "glacier_files_inventory_summary.md"
def summarize(records):
    """Build a Markdown summary of a file inventory.

    Args:
        records: A sized collection (e.g. a list) of dict-like records.
            Each record may carry the keys ``dataset``, ``category``,
            ``extension`` and ``filename``. Missing keys and null/empty
            values are tolerated.

    Returns:
        The report as one string with these sections: total count,
        per-dataset counts, per-category counts, the 20 most common
        extensions, and counts of "notable" patterns (DJI media, tax
        files, archives, source code).
    """
    datasets = Counter()
    categories = Counter()
    extensions = Counter()
    # Only per-pattern totals are reported, so count matches instead of
    # holding every matching filename in memory.
    notable = Counter()

    # Categories that are surfaced in the "Notable Patterns" section.
    notable_categories = {
        "tax": "Tax files",
        "archive": "Archives",
        "source": "Source code",
    }

    for r in records:
        # `or` also covers keys that are present but null/empty, which
        # `.get(key, default)` alone would pass through unchanged.
        dataset = r.get("dataset") or "unknown"
        category = r.get("category") or "unknown"
        ext = (r.get("extension") or "").lower()
        fname = (r.get("filename") or "").lower()

        datasets[dataset] += 1
        categories[category] += 1
        extensions[ext] += 1

        # Detect notable patterns
        if fname.startswith("dji"):
            notable["DJI media"] += 1
        label = notable_categories.get(category)
        if label is not None:
            notable[label] += 1

    summary = [
        "# GlacierEdge Inventory Summary\n",
        f"Total files indexed: **{len(records)}**\n",
        "## Datasets",
    ]
    summary.extend(
        f"- **{ds}**: {count} files" for ds, count in datasets.most_common()
    )
    summary.append("")

    summary.append("## Categories")
    summary.extend(
        f"- **{cat}**: {count}" for cat, count in categories.most_common()
    )
    summary.append("")

    summary.append("## Top Extensions")
    for ext, count in extensions.most_common(20):
        # An empty code span is not valid Markdown; label it explicitly.
        ext_label = f"`{ext}`" if ext else "(no extension)"
        summary.append(f"- {ext_label}: {count}")
    summary.append("")

    summary.append("## Notable Patterns")
    if not notable:
        summary.append("- None detected")
    else:
        # Labels appear in the order they were first encountered.
        summary.extend(
            f"- **{label}**: {count} files" for label, count in notable.items()
        )
    summary.append("")

    return "\n".join(summary)
def main():
    """Read the inventory JSON, summarize it, and write the Markdown report.

    Loads the records from ``INPUT_FILE``, passes them to ``summarize``,
    writes the returned text to ``OUTPUT_FILE``, and prints a confirmation
    line naming the output file.
    """
    # JSON is conventionally UTF-8 (RFC 8259); be explicit so the platform's
    # locale encoding cannot cause decode errors or mojibake.
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        records = json.load(f)

    summary_text = summarize(records)

    # The report echoes dataset/category/extension names from the input,
    # which may be non-ASCII, so write it as UTF-8 as well.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.write(summary_text)

    print(f"\n✅ Summary saved to: {OUTPUT_FILE}")
# Run the summarizer only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()