Files
CryoLens/summarize_chunks.py
2025-12-26 01:40:13 -05:00

104 lines
2.9 KiB
Python
Executable File

import os
import json
from collections import Counter, defaultdict
# Folder containing your JSON or NDJSON chunks; main() lists this directory
# and processes every entry whose name ends in ".json" or ".ndjson".
CHUNKS_FOLDER = "./chunks"
# Folder that receives one "<chunk>_summary.md" file per processed chunk.
OUTPUT_FOLDER = "./summaries"
# Runs at import time, not only when executed as a script; exist_ok=True
# makes this a no-op when the folder is already there.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
def load_chunk(path):
    """Load the records stored in a JSON-array or NDJSON chunk file.

    The format is detected from the first non-whitespace character of the
    file: ``[`` means the whole file is one JSON array; anything else is
    treated as NDJSON (one JSON value per non-blank line).

    Args:
        path: Path of the chunk file to read.

    Returns:
        The parsed JSON array, or a list holding one parsed value per
        non-blank line. An empty or whitespace-only file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the array or any NDJSON line is invalid JSON.
    """
    # "utf-8-sig" decodes plain UTF-8 and additionally drops a leading
    # byte-order mark, which would otherwise sit in front of the "[" and
    # defeat the format detection below.
    with open(path, "r", encoding="utf-8-sig") as f:
        text = f.read()

    # Look past leading blank lines/indentation so a pretty-printed or
    # padded array is not mistaken for NDJSON.
    if text.lstrip().startswith("["):
        return json.loads(text)

    # Split on "\n" only (universal-newline reading already normalised
    # "\r\n" and "\r"). str.splitlines() would also break on characters
    # such as U+2028 that may legally appear inside a JSON string.
    return [json.loads(line) for line in text.split("\n") if line.strip()]
def summarize_chunk(records):
    """Build a Markdown summary of one chunk of file records.

    Each record is read through ``.get`` for the keys ``dataset``,
    ``category``, ``extension`` and ``filename``. A missing ``dataset`` or
    ``category`` is counted as ``"unknown"``; a missing or null
    ``extension`` / ``filename`` is treated as the empty string.

    Args:
        records: Sized iterable of record dicts (``len()`` is called on it).

    Returns:
        Markdown text with the total record count, per-dataset and
        per-category counts, the ten most common lower-cased extensions,
        and counts for the "notable" groups (DJI media, tax files,
        archives, source code), or "- None detected" when no record
        matched any group.
    """
    # Category value -> heading shown under "Notable Patterns".
    notable_categories = {
        "tax": "Tax files",
        "archive": "Archives",
        "source": "Source code",
    }

    datasets = Counter()
    categories = Counter()
    extensions = Counter()
    # Insertion-ordered, so groups are listed in first-seen order.
    notable = defaultdict(list)

    for r in records:
        dataset = r.get("dataset", "unknown")
        category = r.get("category", "unknown")
        # `or ""` covers both an absent key and an explicit JSON null, so
        # the .lower() calls below cannot fail on None.
        filename = r.get("filename") or ""
        ext = str(r.get("extension") or "").lower()
        fname = str(filename).lower()

        datasets[dataset] += 1
        categories[category] += 1
        extensions[ext] += 1

        # Detect notable patterns. The filename is taken from the tolerant
        # lookup above, so a record without a "filename" key is still
        # counted instead of raising KeyError.
        if fname.startswith("dji"):
            notable["DJI media"].append(filename)
        label = notable_categories.get(category)
        if label is not None:
            notable[label].append(filename)

    # Build the Markdown summary; the trailing "\n" on the first two
    # entries produces a blank line after them once joined.
    summary = [
        "### Chunk Summary\n",
        f"Total files: **{len(records)}**\n",
        "#### Datasets:",
    ]
    summary.extend(
        f"- **{ds}**: {count} files" for ds, count in datasets.most_common()
    )
    summary.append("")

    summary.append("#### Categories:")
    summary.extend(
        f"- **{cat}**: {count}" for cat, count in categories.most_common()
    )
    summary.append("")

    summary.append("#### Top Extensions:")
    summary.extend(
        f"- `{ext}`: {count}" for ext, count in extensions.most_common(10)
    )
    summary.append("")

    summary.append("#### Notable Patterns:")
    if not notable:
        summary.append("- None detected")
    else:
        summary.extend(
            f"- **{label}**: {len(files)} files"
            for label, files in notable.items()
        )
    summary.append("")

    return "\n".join(summary)
def main():
    """Write a Markdown summary for every chunk file in CHUNKS_FOLDER.

    Every regular file in CHUNKS_FOLDER whose name ends in ".json" or
    ".ndjson" is loaded with load_chunk(), summarised with
    summarize_chunk(), and written to OUTPUT_FOLDER as
    "<name without extension>_summary.md". Progress is printed to stdout.

    Raises:
        OSError: If CHUNKS_FOLDER cannot be listed or a file cannot be
            read or written.
        json.JSONDecodeError: If a chunk contains invalid JSON.
    """
    # Sorted so the processing order (and the progress log) is the same on
    # every run; os.listdir() returns entries in arbitrary order.
    for filename in sorted(os.listdir(CHUNKS_FOLDER)):
        if not filename.endswith((".json", ".ndjson")):
            continue
        path = os.path.join(CHUNKS_FOLDER, filename)
        # A directory that happens to be named "*.json" is not a chunk.
        if not os.path.isfile(path):
            continue
        print(f"Processing {filename}...")
        records = load_chunk(path)
        summary_text = summarize_chunk(records)
        # Strip only the trailing extension. Replacing ".json" anywhere in
        # the name would mangle names such as "a.json.v2.json" and let
        # different chunks overwrite each other's summary.
        stem, _ext = os.path.splitext(filename)
        out_path = os.path.join(OUTPUT_FOLDER, stem + "_summary.md")
        # Explicit UTF-8: dataset/category names come from the data and may
        # not be representable in the platform's default encoding.
        with open(out_path, "w", encoding="utf-8") as out:
            out.write(summary_text)
    print("\nAll summaries generated in:", OUTPUT_FOLDER)


# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()