Files
CryoLens/summarize_chunks3.py
2025-12-26 01:40:13 -05:00

157 lines
5.8 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
from collections import Counter, defaultdict
from pathlib import Path
# Input: JSON list of per-file inventory records (dataset, category,
# extension, size_bytes, filename, path).
INPUT_FILE = "glacier_files_inventory.json"
# Output: markdown summary of the inventory, written for LLM consumption.
OUTPUT_FILE = "glacier_files_inventory_rich_summary.md"
def human_size(n):
    """Render a byte count as a short human-readable string, e.g. '1.5MB'."""
    amount = float(n)
    for suffix in ("B", "KB", "MB", "GB", "TB"):
        if amount < 1024:
            return f"{amount:.1f}{suffix}"
        amount /= 1024
    # Anything that survives every division above is reported in petabytes.
    return f"{amount:.1f}PB"
def build_rich_summary(records):
    """Build a markdown report summarizing a GlacierEdge file inventory.

    Each record is a dict with optional keys ``dataset``, ``category``,
    ``extension``, ``size_bytes`` and ``path``; missing values fall back to
    "unknown" / "" / 0.  Returns the full markdown document as one string.
    """
    # --- pass 1: aggregate counts, sizes, examples, and anomalies ---------
    datasets = Counter()            # file count per dataset
    categories = Counter()          # file count per category
    extensions = Counter()          # file count per lowercased extension
    size_per_dataset = Counter()    # total bytes per dataset
    size_per_category = Counter()   # total bytes per category
    ext_per_dataset = defaultdict(Counter)
    cat_per_dataset = defaultdict(Counter)
    dataset_examples = defaultdict(list)
    anomalies = []
    for r in records:
        ds = r.get("dataset", "unknown")
        cat = r.get("category", "unknown")
        ext = (r.get("extension") or "").lower()
        size = r.get("size_bytes", 0) or 0  # treat explicit None as 0
        path = r.get("path", "")
        ds_lower = ds.lower()  # hoisted: reused by all heuristics below
        datasets[ds] += 1
        categories[cat] += 1
        extensions[ext] += 1
        size_per_dataset[ds] += size
        size_per_category[cat] += size
        ext_per_dataset[ds][ext] += 1
        cat_per_dataset[ds][cat] += 1
        # keep a few example files per dataset
        if len(dataset_examples[ds]) < 5:
            dataset_examples[ds].append(path)
        # simple anomaly heuristics
        if cat == "tax" and "video" in ds_lower:
            anomalies.append(f"Tax file in video dataset: {path}")
        if cat == "source" and ("photo" in ds_lower or "video" in ds_lower):
            anomalies.append(f"Source code in media dataset: {path}")
        if cat == "archive" and size > 5 * 1024 * 1024 * 1024:
            anomalies.append(f"Very large archive (>5GB): {path} ({human_size(size)})")

    # --- pass 2: render the markdown report -------------------------------
    lines = []
    total_files = len(records)
    total_size = sum(r.get("size_bytes", 0) or 0 for r in records)
    lines.append("# GlacierEdge Inventory Rich Semantic Summary\n")
    lines.append(f"- **Total files:** {total_files}")
    lines.append(f"- **Total size:** {human_size(total_size)}\n")
    # High-level view
    lines.append("## High-level overview\n")
    lines.append("This inventory describes your GlacierEdge archive in terms of datasets, categories, extensions, and a few heuristic anomalies. It is meant to give an LLM enough semantic context to propose better folder structures, security groupings, and cleanup strategies.\n")
    # Per-dataset narrative, most-populated dataset first
    lines.append("## Per-dataset profiles\n")
    for ds, count in datasets.most_common():
        ds_size = human_size(size_per_dataset[ds])
        top_cats = ", ".join(
            f"{c} ({n})" for c, n in cat_per_dataset[ds].most_common(5)
        ) or "none"
        top_exts = ", ".join(
            f"{e or 'no_ext'} ({n})" for e, n in ext_per_dataset[ds].most_common(5)
        ) or "none"
        lines.append(f"### Dataset: {ds}\n")
        lines.append(f"- Approximate files: **{count}**")
        lines.append(f"- Approximate size: **{ds_size}**")
        lines.append(f"- Dominant categories: {top_cats}")
        lines.append(f"- Dominant extensions: {top_exts}")
        if dataset_examples[ds]:
            lines.append("- Example paths:")
            for ex in dataset_examples[ds]:
                # two-space indent so these nest under "Example paths:" in CommonMark
                lines.append(f"  - `{ex}`")
        lines.append("")
    # Category-focused view
    lines.append("## Category insights\n")
    for cat, count in categories.most_common():
        cat_size = human_size(size_per_category[cat])
        lines.append(f"### Category: {cat}\n")
        lines.append(f"- Files: **{count}**")
        lines.append(f"- Total size: **{cat_size}**")
        # where this category lives (Counter returns 0 for absent keys
        # without inserting, so this does not mutate cat_per_dataset)
        ds_dist = [
            (ds, n) for ds, n in ((d, cat_per_dataset[d][cat]) for d in datasets)
            if n > 0
        ]
        ds_dist.sort(key=lambda x: x[1], reverse=True)
        if ds_dist:
            lines.append("- Main datasets for this category:")
            for ds, n in ds_dist[:5]:
                # two-space indent: nested bullet under the line above
                lines.append(f"  - {ds}: {n} files")
        else:
            lines.append("- No dataset distribution available.")
        lines.append("")
    # Extension-focused view (top 40 only, to bound report size)
    lines.append("## Extension landscape\n")
    lines.append("These are the most common extensions and how prevalent they are.\n")
    for ext, count in extensions.most_common(40):
        label = ext or "no_extension"
        lines.append(f"- `{label}`: {count} files")
    lines.append("")
    # Anomalies and reorg hints (capped at 200 entries)
    lines.append("## Anomalies and reorganization hints\n")
    if not anomalies:
        lines.append("- No obvious anomalies detected by simple heuristics.\n")
    else:
        lines.append("The following items may indicate misplaced files, risky storage, or opportunities for reorganization:\n")
        for a in anomalies[:200]:
            lines.append(f"- {a}")
        if len(anomalies) > 200:
            lines.append(f"- ...and {len(anomalies) - 200} more potential anomalies.\n")
    # Guidance for the LLM
    lines.append("## How to use this summary (for an LLM)\n")
    lines.append(
        "Use this summary to:\n"
        "- Propose a cleaner, more logical folder structure for GlacierEdge.\n"
        "- Suggest which datasets should hold tax, personal, or sensitive files.\n"
        "- Identify which datasets look like long-term archives vs. working sets.\n"
        "- Recommend which categories or extensions should be moved to cold storage.\n"
        "- Highlight where source code, archives, or tax files are stored in unexpected places.\n"
    )
    return "\n".join(lines)
def main():
    """Load the JSON inventory, build the markdown summary, and write it out."""
    # Explicit UTF-8 on both ends so the report (emoji header, non-ASCII
    # paths) round-trips regardless of the platform's default encoding.
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        records = json.load(f)
    summary_text = build_rich_summary(records)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.write(summary_text)
    print(f"\n✅ Rich semantic summary saved to: {OUTPUT_FILE}")
# Run the summarizer only when executed as a script, not on import.
if __name__ == "__main__":
    main()