import json
from collections import Counter, defaultdict
from pathlib import Path

INPUT_FILE = "glacier_files_inventory.json"
OUTPUT_FILE = "glacier_files_inventory_rich_summary.md"


def human_size(n):
    """Format a byte count as a human-readable string."""
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if n < 1024:
            return f"{n:.1f}{unit}"
        n /= 1024
    return f"{n:.1f}PB"


def build_rich_summary(records):
    datasets = Counter()
    categories = Counter()
    extensions = Counter()
    size_per_dataset = Counter()
    size_per_category = Counter()
    ext_per_dataset = defaultdict(Counter)
    cat_per_dataset = defaultdict(Counter)
    dataset_examples = defaultdict(list)
    anomalies = []

    for r in records:
        ds = r.get("dataset", "unknown")
        cat = r.get("category", "unknown")
        ext = (r.get("extension") or "").lower()
        size = r.get("size_bytes", 0) or 0
        fname = r.get("filename", "")
        path = r.get("path", "")

        datasets[ds] += 1
        categories[cat] += 1
        extensions[ext] += 1
        size_per_dataset[ds] += size
        size_per_category[cat] += size
        ext_per_dataset[ds][ext] += 1
        cat_per_dataset[ds][cat] += 1

        # keep a few example files per dataset
        if len(dataset_examples[ds]) < 5:
            dataset_examples[ds].append(path)

        # simple anomaly heuristics
        if cat == "tax" and "video" in ds.lower():
            anomalies.append(f"Tax file in video dataset: {path}")
        if cat == "source" and ("photo" in ds.lower() or "video" in ds.lower()):
            anomalies.append(f"Source code in media dataset: {path}")
        if cat == "archive" and size > 5 * 1024 * 1024 * 1024:
            anomalies.append(f"Very large archive (>5GB): {path} ({human_size(size)})")

    lines = []
    total_files = len(records)
    total_size = sum(r.get("size_bytes", 0) or 0 for r in records)

    lines.append("# GlacierEdge Inventory – Rich Semantic Summary\n")
    lines.append(f"- **Total files:** {total_files}")
    lines.append(f"- **Total size:** {human_size(total_size)}\n")

    # High-level view
    lines.append("## High-level overview\n")
    lines.append(
        "This inventory describes your GlacierEdge archive in terms of datasets, "
        "categories, extensions, and a few heuristic anomalies. It is meant to give "
        "an LLM enough semantic context to propose better folder structures, "
        "security groupings, and cleanup strategies.\n"
    )

    # Per-dataset narrative
    lines.append("## Per-dataset profiles\n")
    for ds, count in datasets.most_common():
        ds_size = human_size(size_per_dataset[ds])
        top_cats = ", ".join(
            f"{c} ({n})" for c, n in cat_per_dataset[ds].most_common(5)
        ) or "none"
        top_exts = ", ".join(
            f"{e or 'no_ext'} ({n})" for e, n in ext_per_dataset[ds].most_common(5)
        ) or "none"

        lines.append(f"### Dataset: {ds}\n")
        lines.append(f"- Approximate files: **{count}**")
        lines.append(f"- Approximate size: **{ds_size}**")
        lines.append(f"- Dominant categories: {top_cats}")
        lines.append(f"- Dominant extensions: {top_exts}")
        if dataset_examples[ds]:
            lines.append("- Example paths:")
            for ex in dataset_examples[ds]:
                lines.append(f"  - `{ex}`")
        lines.append("")

    # Category-focused view
    lines.append("## Category insights\n")
    for cat, count in categories.most_common():
        cat_size = human_size(size_per_category[cat])
        lines.append(f"### Category: {cat}\n")
        lines.append(f"- Files: **{count}**")
        lines.append(f"- Total size: **{cat_size}**")

        # where this category lives
        ds_dist = [
            (ds, n)
            for ds, n in ((d, cat_per_dataset[d][cat]) for d in datasets)
            if n > 0
        ]
        ds_dist.sort(key=lambda x: x[1], reverse=True)
        if ds_dist:
            lines.append("- Main datasets for this category:")
            for ds, n in ds_dist[:5]:
                lines.append(f"  - {ds}: {n} files")
        else:
            lines.append("- No dataset distribution available.")
        lines.append("")

    # Extension-focused view
    lines.append("## Extension landscape\n")
    lines.append("These are the most common extensions and how prevalent they are.\n")
    for ext, count in extensions.most_common(40):
        label = ext or "no_extension"
        lines.append(f"- `{label}`: {count} files")
    lines.append("")

    # Anomalies and reorg hints
    lines.append("## Anomalies and reorganization hints\n")
    if not anomalies:
        lines.append("- No obvious anomalies detected by simple heuristics.\n")
    else:
        lines.append(
            "The following items may indicate misplaced files, risky storage, "
            "or opportunities for reorganization:\n"
        )
        for a in anomalies[:200]:
            lines.append(f"- {a}")
        if len(anomalies) > 200:
            lines.append(f"- ...and {len(anomalies) - 200} more potential anomalies.\n")

    # Guidance for the LLM
    lines.append("## How to use this summary (for an LLM)\n")
    lines.append(
        "Use this summary to:\n"
        "- Propose a cleaner, more logical folder structure for GlacierEdge.\n"
        "- Suggest which datasets should hold tax, personal, or sensitive files.\n"
        "- Identify which datasets look like long-term archives vs. working sets.\n"
        "- Recommend which categories or extensions should be moved to cold storage.\n"
        "- Highlight where source code, archives, or tax files are stored in unexpected places.\n"
    )

    return "\n".join(lines)


def main():
    with open(INPUT_FILE, "r") as f:
        records = json.load(f)

    summary_text = build_rich_summary(records)

    with open(OUTPUT_FILE, "w") as out:
        out.write(summary_text)

    print(f"\n✅ Rich semantic summary saved to: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
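
# Rough sketch of the record shape the script assumes in
# glacier_files_inventory.json, inferred only from the keys read above
# (all values below are illustrative placeholders, not real inventory data):
#
# [
#   {
#     "path": "photos/2019/IMG_0001.jpg",
#     "filename": "IMG_0001.jpg",
#     "extension": "jpg",
#     "size_bytes": 2048576,
#     "dataset": "photos-2019",
#     "category": "image"
#   },
#   ...
# ]
#
# Missing keys are tolerated: the .get(...) calls fall back to "unknown",
# an empty string, or 0, so partially populated inventories still summarize.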