# Listing metadata: 104 lines, 2.9 KiB, Python, executable file.
import os
|
|
import json
|
|
from collections import Counter, defaultdict
|
|
|
|
# Folder containing the JSON or NDJSON chunk files to summarise.
CHUNKS_FOLDER = "./chunks"
# Folder that receives one Markdown summary per chunk file.
OUTPUT_FOLDER = "./summaries"

# Create the output folder up front (at import time) so later writes cannot
# fail on a missing directory; exist_ok makes repeated runs harmless.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)


def load_chunk(path):
    """Load the records stored in a chunk file.

    Two layouts are supported and detected from the first non-whitespace
    character of the file:

    * ``[`` - the whole file is parsed as a single JSON document
      (normally an array of records);
    * anything else - the file is treated as NDJSON: every non-blank line
      is parsed as its own JSON value.

    Args:
        path: Path of the chunk file to read.

    Returns:
        The parsed JSON document for the array layout, or a list with one
        parsed value per non-blank line for NDJSON. An empty or
        whitespace-only file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the content is not valid JSON / NDJSON.
    """
    # "utf-8-sig" decodes UTF-8 regardless of the platform locale and drops
    # a leading byte-order mark, which would otherwise hide the "[" below.
    with open(path, "r", encoding="utf-8-sig") as f:
        text = f.read()

    # Sniff past leading blank lines / indentation instead of trusting the
    # very first character of the file.
    if text.lstrip().startswith("["):
        # Standard JSON array
        return json.loads(text)

    # NDJSON (one JSON value per line). Text mode has already normalised
    # line endings to "\n", so a plain split mirrors line-by-line reading.
    return [json.loads(line) for line in text.split("\n") if line.strip()]


def summarize_chunk(records):
    """Build a Markdown summary for the records of one chunk.

    Each record is read through ``.get`` using the keys ``"dataset"``,
    ``"category"``, ``"extension"`` and ``"filename"``; all of them are
    optional. A missing dataset or category is counted as ``"unknown"``;
    a missing (or ``None``) extension or filename is treated as ``""``.

    Args:
        records: Sequence of dict-like records (``len()`` is taken of it).

    Returns:
        A Markdown string with the total record count, per-dataset and
        per-category counts, the ten most common lower-cased extensions,
        and the number of files matching each "notable" pattern (or
        "None detected" when nothing matched).
    """
    # Categories that are reported under "Notable Patterns" and the label
    # each one is reported as.
    notable_categories = {
        "tax": "Tax files",
        "archive": "Archives",
        "source": "Source code",
    }

    datasets = Counter()
    categories = Counter()
    extensions = Counter()
    notable = defaultdict(list)

    for r in records:
        dataset = r.get("dataset", "unknown")
        category = r.get("category", "unknown")
        # `or ""` also covers an explicit None value, which would otherwise
        # crash on .lower().
        ext = (r.get("extension") or "").lower()
        filename = r.get("filename") or ""

        datasets[dataset] += 1
        categories[category] += 1
        extensions[ext] += 1

        # Detect notable patterns. The filename comes from the tolerant
        # lookup above, so a record without one is still counted instead of
        # raising KeyError.
        if filename.lower().startswith("dji"):
            notable["DJI media"].append(filename)
        label = notable_categories.get(category)
        if label is not None:
            notable[label].append(filename)

    # Build Markdown summary
    summary = [
        "### Chunk Summary\n",
        f"Total files: **{len(records)}**\n",
    ]

    summary.append("#### Datasets:")
    summary.extend(
        f"- **{ds}**: {count} files" for ds, count in datasets.most_common()
    )
    summary.append("")

    summary.append("#### Categories:")
    summary.extend(
        f"- **{cat}**: {count}" for cat, count in categories.most_common()
    )
    summary.append("")

    summary.append("#### Top Extensions:")
    summary.extend(
        f"- `{ext}`: {count}" for ext, count in extensions.most_common(10)
    )
    summary.append("")

    summary.append("#### Notable Patterns:")
    if not notable:
        summary.append("- None detected")
    else:
        # Labels appear in the order they were first encountered.
        summary.extend(
            f"- **{label}**: {len(files)} files"
            for label, files in notable.items()
        )
    summary.append("")

    return "\n".join(summary)


def main():
    """Write a Markdown summary for every chunk file in CHUNKS_FOLDER.

    Each regular file whose name ends in ``.json`` or ``.ndjson`` is loaded
    with ``load_chunk``, summarised with ``summarize_chunk`` and written to
    OUTPUT_FOLDER as ``<name without suffix>_summary.md``. Progress is
    printed to stdout.

    Raises:
        OSError: If CHUNKS_FOLDER cannot be listed or a file cannot be
            read or written.
    """
    # Longest suffix first, purely for readability: neither suffix is a
    # suffix of the other, so the order does not affect the match.
    suffixes = (".ndjson", ".json")

    # Sorted so the processing order (and the log) is reproducible;
    # os.listdir makes no ordering promise.
    for filename in sorted(os.listdir(CHUNKS_FOLDER)):
        if not filename.endswith(suffixes):
            continue

        path = os.path.join(CHUNKS_FOLDER, filename)
        # A directory that merely looks like a chunk file cannot be opened.
        if not os.path.isfile(path):
            continue

        print(f"Processing {filename}...")

        records = load_chunk(path)
        summary_text = summarize_chunk(records)

        # Strip only the trailing suffix; str.replace would also remove
        # ".json" / ".ndjson" occurring in the middle of the name.
        stem = filename
        for suffix in suffixes:
            if filename.endswith(suffix):
                stem = filename[: -len(suffix)]
                break
        out_path = os.path.join(OUTPUT_FOLDER, stem + "_summary.md")

        # Explicit UTF-8 so non-ASCII names in the summary are written the
        # same way on every platform.
        with open(out_path, "w", encoding="utf-8") as out:
            out.write(summary_text)

    print("\nAll summaries generated in:", OUTPUT_FOLDER)


# Run the summariser only when executed as a script, not when imported.
if __name__ == "__main__":
    main()