# CryoLens/summarize_chunks5.py
import json
from collections import Counter, defaultdict
from datetime import datetime

INPUT_FILE = "glacier_files_inventory.json"
OUTPUT_FILE = "glacier_dataset_summaries.md"
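
# The inventory is assumed to be a JSON array of flat records. The record
# shape below is inferred from the fields this script reads; the values are
# illustrative, not a spec:
#
#   {
#     "path": "/mnt/glacier/photos/img_0001.jpg",
#     "dataset": "photos",
#     "category": "faces",
#     "extension": "jpg",
#     "size_bytes": 2048576,
#     "modified": 1735171200,
#     "hash_sha1": "da39a3ee5e6b..."
#   }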


def human_size(n):
    """Render a byte count as a human-readable string."""
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if n < 1024:
            return f"{n:.1f}{unit}"
        n /= 1024
    return f"{n:.1f}PB"


def ts(t):
    """Format a Unix timestamp as YYYY-MM-DD, tolerating bad values."""
    try:
        return datetime.fromtimestamp(t).strftime("%Y-%m-%d")
    except (TypeError, ValueError, OSError, OverflowError):
        return "unknown"


def main():
    with open(INPUT_FILE, "r") as f:
        records = json.load(f)

    # Group records by dataset
    by_dataset = defaultdict(list)
    for r in records:
        ds = r.get("dataset", "unknown")
        by_dataset[ds].append(r)

    lines = []
    lines.append("# GlacierEdge Dataset-Focused Semantic Summaries\n")
    lines.append("Short, high-signal summaries designed for LLM reasoning about reorganization.\n")

    # Largest datasets (by file count) first
    for ds, items in sorted(by_dataset.items(), key=lambda x: -len(x[1])):
        count = len(items)
        total_size = sum(r.get("size_bytes", 0) or 0 for r in items)
        avg_size = total_size / count if count else 0

        # Counters
        cats = Counter(r.get("category", "unknown") for r in items)
        exts = Counter((r.get("extension") or "").lower() for r in items)

        # Extension spread: unique-extension count as a rough entropy proxy
        # (see shannon_entropy above for a stricter measure)
        unique_exts = len(exts)
        top_ext, top_ext_count = exts.most_common(1)[0] if exts else ("none", 0)
        dominance_ratio = top_ext_count / count if count else 0
        if unique_exts > 50:
            entropy_label = "Very high (chaotic/mixed-purpose)"
        elif unique_exts > 30:
            entropy_label = "High (likely mixed content)"
        elif unique_exts > 15:
            entropy_label = "Moderate"
        else:
            entropy_label = "Low (coherent dataset)"

        # Timestamps
        modified_times = [
            r.get("modified") for r in items if isinstance(r.get("modified"), (int, float))
        ]
        oldest = ts(min(modified_times)) if modified_times else "unknown"
        newest = ts(max(modified_times)) if modified_times else "unknown"

        # Hash coverage
        hash_count = sum(1 for r in items if r.get("hash_sha1"))
        hash_ratio = hash_count / count if count else 0

        # Example paths for the summary below
        examples = [r.get("path", "") for r in items[:5]]

        # Per-extension details
        ext_details = defaultdict(lambda: {"count": 0, "total_size": 0, "cats": Counter()})
        for r in items:
            ext = (r.get("extension") or "").lower()
            size = r.get("size_bytes", 0) or 0
            cat = r.get("category", "unknown")
            ext_details[ext]["count"] += 1
            ext_details[ext]["total_size"] += size
            ext_details[ext]["cats"][cat] += 1

        # Purpose inference from category names
        purpose = []
        if "faces" in cats or "scenery" in cats or "objects" in cats:
            purpose.append("image/photo collection")
        if "document" in cats or "digitized_docs" in cats:
            purpose.append("documents or backups")
        if "girls" in cats or "adult_video" in cats:
            purpose.append("sensitive/private material")
        if "source" in cats:
            purpose.append("source code or development files")
        if "archive" in cats or "iso" in cats:
            purpose.append("long-term archival content")
        if not purpose:
            purpose.append("mixed or unclear")

        # Write dataset summary
        lines.append(f"## Dataset: {ds}\n")
        lines.append(f"- Files: **{count}**")
        lines.append(f"- Total size: **{human_size(total_size)}**")
        lines.append(f"- Average file size: **{human_size(avg_size)}**")
        lines.append(f"- Unique extensions: **{unique_exts}**")
        lines.append(f"- Extension entropy: **{entropy_label}**")
        lines.append(f"- Dominant extension: `{top_ext}` ({top_ext_count} files, {dominance_ratio:.1%})")
        lines.append(f"- Dominant categories: {', '.join(f'{c} ({n})' for c, n in cats.most_common(3))}")
        lines.append(f"- Oldest file: {oldest}")
        lines.append(f"- Newest file: {newest}")
        lines.append(f"- Hash coverage: {hash_ratio:.1%}")
        lines.append(f"- Likely purpose: {', '.join(purpose)}")
        lines.append(f"- Example paths: {'; '.join(f'`{p}`' for p in examples)}\n")

        # Extension breakdown table
        lines.append("### Extension Breakdown\n")
        lines.append("| Extension | Count | Total Size | Avg Size | Dominant Categories |")
        lines.append("|-----------|-------|------------|----------|---------------------|")
        for ext, info in sorted(ext_details.items(), key=lambda x: -x[1]["count"]):
            count_e = info["count"]
            total_e = info["total_size"]
            avg_e = total_e / count_e if count_e else 0
            dom_cats = ", ".join(f"{c} ({n})" for c, n in info["cats"].most_common(3))
            lines.append(
                f"| `{ext or 'no_ext'}` | {count_e} | {human_size(total_e)} | {human_size(avg_e)} | {dom_cats} |"
            )

        lines.append("\n---\n")

    with open(OUTPUT_FILE, "w") as out:
        out.write("\n".join(lines))
    print(f"\n✅ Dataset-focused summaries saved to: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
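
# Usage sketch (assumes glacier_files_inventory.json sits in the working
# directory):
#
#   python summarize_chunks5.py
#
# The markdown summaries are written to glacier_dataset_summaries.md.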