import os
import json
from collections import Counter, defaultdict

# Folder containing your JSON or NDJSON chunks
CHUNKS_FOLDER = "./chunks"
OUTPUT_FOLDER = "./summaries"

os.makedirs(OUTPUT_FOLDER, exist_ok=True)


def load_chunk(path):
    """Load a chunk file as a list of record dicts.

    The format is sniffed from the first character: a leading "[" means a
    standard JSON array; anything else is treated as NDJSON (one JSON
    object per non-blank line).
    """
    with open(path, "r", encoding="utf-8") as f:
        first_char = f.read(1)
        f.seek(0)
        if first_char == "[":
            # Standard JSON array
            return json.load(f)
        # NDJSON (one JSON object per line); blank lines are skipped
        return [json.loads(line) for line in f if line.strip()]


def summarize_chunk(records):
    """Build a Markdown summary string for a list of file records.

    Each record is a dict that may contain "dataset", "category",
    "extension" and "filename" keys; missing keys are tolerated and
    counted under "unknown" / "" as appropriate.
    """
    datasets = Counter()
    categories = Counter()
    extensions = Counter()
    notable = defaultdict(list)

    for r in records:
        dataset = r.get("dataset", "unknown")
        category = r.get("category", "unknown")
        ext = r.get("extension", "").lower()
        # Keep the original spelling for display; lowercase only for matching.
        original_name = r.get("filename", "")
        fname = original_name.lower()

        datasets[dataset] += 1
        categories[category] += 1
        extensions[ext] += 1

        # Detect notable patterns.
        # BUG FIX: the original appended r["filename"], which raised
        # KeyError for records without a "filename" key even though the
        # lookup above deliberately tolerates that case via .get().
        if fname.startswith("dji"):
            notable["DJI media"].append(original_name)
        if category == "tax":
            notable["Tax files"].append(original_name)
        if category == "archive":
            notable["Archives"].append(original_name)
        if category == "source":
            notable["Source code"].append(original_name)

    # Build Markdown summary
    summary = []
    summary.append("### Chunk Summary\n")
    summary.append(f"Total files: **{len(records)}**\n")

    summary.append("#### Datasets:")
    for ds, count in datasets.most_common():
        summary.append(f"- **{ds}**: {count} files")
    summary.append("")

    summary.append("#### Categories:")
    for cat, count in categories.most_common():
        summary.append(f"- **{cat}**: {count}")
    summary.append("")

    summary.append("#### Top Extensions:")
    for ext, count in extensions.most_common(10):
        summary.append(f"- `{ext}`: {count}")
    summary.append("")

    summary.append("#### Notable Patterns:")
    if not notable:
        summary.append("- None detected")
    else:
        for label, files in notable.items():
            summary.append(f"- **{label}**: {len(files)} files")
    summary.append("")

    return "\n".join(summary)


def main():
    """Summarize every .json / .ndjson chunk file in CHUNKS_FOLDER."""
    for filename in os.listdir(CHUNKS_FOLDER):
        if not filename.endswith(".json") and not filename.endswith(".ndjson"):
            continue

        path = os.path.join(CHUNKS_FOLDER, filename)
        # BUG FIX: the original f-string contained no placeholder, so the
        # file being processed was never shown.
        print(f"Processing {filename}...")

        records = load_chunk(path)
        summary_text = summarize_chunk(records)

        # BUG FIX: chained .replace(".json", "") mangled ".ndjson" names
        # into "name.nd" before the ".ndjson" replace could ever match;
        # splitext strips exactly the final extension for both formats.
        out_name = os.path.splitext(filename)[0] + "_summary.md"
        out_path = os.path.join(OUTPUT_FOLDER, out_name)

        with open(out_path, "w", encoding="utf-8") as out:
            out.write(summary_text)

    print("\nAll summaries generated in:", OUTPUT_FOLDER)


if __name__ == "__main__":
    main()