"""Build a rich, LLM-oriented Markdown summary of a GlacierEdge file inventory.

Reads the JSON inventory in INPUT_FILE and writes a Markdown report to
OUTPUT_FILE describing datasets, categories, extensions, and a few simple
anomaly heuristics.
"""
import json
from collections import Counter, defaultdict
from pathlib import Path

INPUT_FILE = "glacier_files_inventory.json"
OUTPUT_FILE = "glacier_files_inventory_rich_summary.md"
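
# The code below assumes each inventory record is a dict with roughly the keys
# it reads via r.get(); this is an illustrative, hypothetical entry, not taken
# from a real inventory:
#
# {
#     "dataset": "photos_2019",
#     "category": "photo",
#     "extension": ".jpg",
#     "size_bytes": 2483112,
#     "filename": "IMG_0042.jpg",
#     "path": "photos_2019/2019-07/IMG_0042.jpg"
# }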
def human_size(n):
    """Render a byte count as a short human-readable size string."""
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if n < 1024:
            return f"{n:.1f}{unit}"
        n /= 1024
    return f"{n:.1f}PB"
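
# A few illustrative values, worked out by hand from the loop above:
#   human_size(512)           -> "512.0B"
#   human_size(1536)          -> "1.5KB"
#   human_size(3 * 1024**3)   -> "3.0GB"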

def build_rich_summary(records):
    """Aggregate inventory records into a Markdown summary string."""
    datasets = Counter()
    categories = Counter()
    extensions = Counter()
    size_per_dataset = Counter()
    size_per_category = Counter()
    ext_per_dataset = defaultdict(Counter)
    cat_per_dataset = defaultdict(Counter)
    dataset_examples = defaultdict(list)
    anomalies = []

    for r in records:
        ds = r.get("dataset", "unknown")
        cat = r.get("category", "unknown")
        ext = (r.get("extension") or "").lower()
        size = r.get("size_bytes", 0) or 0
        fname = r.get("filename", "")
        path = r.get("path", "")

        datasets[ds] += 1
        categories[cat] += 1
        extensions[ext] += 1
        size_per_dataset[ds] += size
        size_per_category[cat] += size
        ext_per_dataset[ds][ext] += 1
        cat_per_dataset[ds][cat] += 1

        # keep a few example files per dataset
        if len(dataset_examples[ds]) < 5:
            dataset_examples[ds].append(path)

        # simple anomaly heuristics
        if cat == "tax" and "video" in ds.lower():
            anomalies.append(f"Tax file in video dataset: {path}")
        if cat == "source" and ("photo" in ds.lower() or "video" in ds.lower()):
            anomalies.append(f"Source code in media dataset: {path}")
        if cat == "archive" and size > 5 * 1024 * 1024 * 1024:
            anomalies.append(f"Very large archive (>5GB): {path} ({human_size(size)})")

    lines = []
    total_files = len(records)
    total_size = sum(r.get("size_bytes", 0) or 0 for r in records)

    lines.append("# GlacierEdge Inventory – Rich Semantic Summary\n")
    lines.append(f"- **Total files:** {total_files}")
    lines.append(f"- **Total size:** {human_size(total_size)}\n")

    # High-level view
    lines.append("## High-level overview\n")
    lines.append("This inventory describes your GlacierEdge archive in terms of datasets, categories, extensions, and a few heuristic anomalies. It is meant to give an LLM enough semantic context to propose better folder structures, security groupings, and cleanup strategies.\n")

    # Per-dataset narrative
    lines.append("## Per-dataset profiles\n")
    for ds, count in datasets.most_common():
        ds_size = human_size(size_per_dataset[ds])
        top_cats = ", ".join(
            f"{c} ({n})" for c, n in cat_per_dataset[ds].most_common(5)
        ) or "none"
        top_exts = ", ".join(
            f"{e or 'no_ext'} ({n})" for e, n in ext_per_dataset[ds].most_common(5)
        ) or "none"

        lines.append(f"### Dataset: {ds}\n")
        lines.append(f"- Approximate files: **{count}**")
        lines.append(f"- Approximate size: **{ds_size}**")
        lines.append(f"- Dominant categories: {top_cats}")
        lines.append(f"- Dominant extensions: {top_exts}")

        if dataset_examples[ds]:
            lines.append("- Example paths:")
            for ex in dataset_examples[ds]:
                lines.append(f"  - `{ex}`")
        lines.append("")

    # Category-focused view
    lines.append("## Category insights\n")
    for cat, count in categories.most_common():
        cat_size = human_size(size_per_category[cat])
        lines.append(f"### Category: {cat}\n")
        lines.append(f"- Files: **{count}**")
        lines.append(f"- Total size: **{cat_size}**")

        # where this category lives
        ds_dist = [
            (ds, n) for ds, n in ((d, cat_per_dataset[d][cat]) for d in datasets)
            if n > 0
        ]
        ds_dist.sort(key=lambda x: x[1], reverse=True)
        if ds_dist:
            lines.append("- Main datasets for this category:")
            for ds, n in ds_dist[:5]:
                lines.append(f"  - {ds}: {n} files")
        else:
            lines.append("- No dataset distribution available.")

        lines.append("")

    # Extension-focused view
    lines.append("## Extension landscape\n")
    lines.append("These are the most common extensions and how prevalent they are.\n")
    for ext, count in extensions.most_common(40):
        label = ext or "no_extension"
        lines.append(f"- `{label}`: {count} files")
    lines.append("")

    # Anomalies and reorg hints
    lines.append("## Anomalies and reorganization hints\n")
    if not anomalies:
        lines.append("- No obvious anomalies detected by simple heuristics.\n")
    else:
        lines.append("The following items may indicate misplaced files, risky storage, or opportunities for reorganization:\n")
        for a in anomalies[:200]:
            lines.append(f"- {a}")
        if len(anomalies) > 200:
            lines.append(f"- ...and {len(anomalies) - 200} more potential anomalies.\n")

    # Guidance for the LLM
    lines.append("## How to use this summary (for an LLM)\n")
    lines.append(
        "Use this summary to:\n"
        "- Propose a cleaner, more logical folder structure for GlacierEdge.\n"
        "- Suggest which datasets should hold tax, personal, or sensitive files.\n"
        "- Identify which datasets look like long-term archives vs. working sets.\n"
        "- Recommend which categories or extensions should be moved to cold storage.\n"
        "- Highlight where source code, archives, or tax files are stored in unexpected places.\n"
    )

    return "\n".join(lines)

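# For orientation, the generated Markdown roughly follows this outline
# (section names taken from the appends above):
#
#   # GlacierEdge Inventory – Rich Semantic Summary
#   ## High-level overview
#   ## Per-dataset profiles
#   ### Dataset: <name>
#   ## Category insights
#   ### Category: <name>
#   ## Extension landscape
#   ## Anomalies and reorganization hints
#   ## How to use this summary (for an LLM)
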
def main():
    # encoding="utf-8" keeps the en dash and check-mark characters intact on
    # platforms whose default encoding is not UTF-8
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        records = json.load(f)

    summary_text = build_rich_summary(records)

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.write(summary_text)

    print(f"\n✅ Rich semantic summary saved to: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()
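
# Typical invocation, assuming the inventory JSON sits in the working
# directory (the script file name below is only an example, not prescribed
# by the project):
#
#   python build_rich_summary.py
#
# The report is then written alongside it as
# glacier_files_inventory_rich_summary.md.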