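"""GlacierLens master summary generator.

Reads a flat JSON inventory of the archive's files and writes a Markdown
report (global totals, per-dataset profiles, category and extension
breakdowns, a simple extension-entropy score, and heuristic anomaly flags)
intended to give an LLM enough context to suggest reorganization and
cleanup.

Each inventory record is expected to expose the keys read below:
"dataset", "category", "extension", "size_bytes", "path", "modified"
(Unix timestamp) and "hash_sha1". The exact schema of the upstream
inventory step is an assumption; missing keys fall back to defaults.
"""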
import json
from collections import Counter, defaultdict
from datetime import datetime

INPUT_FILE = "glacier_files_inventory.json"
OUTPUT_FILE = "glacier_lens_master_summary.md"


def human_size(n):
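    """Format a byte count as a human-readable string (B, KB, ..., PB; 1024-based)."""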
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if n < 1024:
            return f"{n:.1f}{unit}"
        n /= 1024
    return f"{n:.1f}PB"


def ts(t):
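    """Format a Unix timestamp as YYYY-MM-DD, or "unknown" if conversion fails."""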
    try:
        return datetime.fromtimestamp(t).strftime("%Y-%m-%d")
    except (OverflowError, OSError, ValueError):
        return "unknown"


def main():
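    """Read INPUT_FILE, build the Markdown summary, and write it to OUTPUT_FILE."""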
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        records = json.load(f)

    print(f"\n✅ Loaded file inventory from: {INPUT_FILE}")

    # GLOBAL COUNTERS
    datasets = Counter()
    categories = Counter()
    extensions = Counter()
    size_per_dataset = Counter()
    size_per_category = Counter()
    ext_per_dataset = defaultdict(Counter)
    cat_per_dataset = defaultdict(Counter)
    dataset_examples = defaultdict(list)
    anomalies = []

    # PER-DATASET GROUPING
    by_dataset = defaultdict(list)

    for r in records:
        ds = r.get("dataset", "unknown")
        cat = r.get("category", "unknown")
        ext = (r.get("extension") or "").lower()
        size = r.get("size_bytes", 0) or 0
        path = r.get("path", "")

        datasets[ds] += 1
        categories[cat] += 1
        extensions[ext] += 1
        size_per_dataset[ds] += size
        size_per_category[cat] += size
        ext_per_dataset[ds][ext] += 1
        cat_per_dataset[ds][cat] += 1
        by_dataset[ds].append(r)

        if len(dataset_examples[ds]) < 5:
            dataset_examples[ds].append(path)

        # GLOBAL ANOMALIES
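        # These checks are simple substring heuristics: they assume dataset
        # names mention "video"/"photo" for media and "vault" for the secure
        # area, so the rules may need tuning for differently named archives.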
        if cat == "tax" and "video" in ds.lower():
            anomalies.append(f"Tax file in video dataset: {path}")
        if cat == "source" and ("photo" in ds.lower() or "video" in ds.lower()):
            anomalies.append(f"Source code in media dataset: {path}")
        if cat == "adult_video" and "vault" not in ds.lower():
            anomalies.append(f"Adult content outside secure dataset: {path}")
        if cat == "archive" and size > 5 * 1024 * 1024 * 1024:
            anomalies.append(f"Very large archive (>5GB): {path} ({human_size(size)})")

    # BEGIN OUTPUT
    lines = []
    total_files = len(records)
    total_size = sum(r.get("size_bytes", 0) or 0 for r in records)

    lines.append("# GlacierLens – Master Semantic Summary\n")
    lines.append(f"- **Total files:** {total_files}")
    lines.append(f"- **Total size:** {human_size(total_size)}\n")

    # HIGH-LEVEL OVERVIEW
    lines.append("## High-level overview\n")
    lines.append(
        "This summary provides a global and per-dataset semantic analysis of your GlacierEdge archive. "
        "It includes dataset profiles, category insights, extension landscapes, entropy scoring, and anomaly detection. "
        "It is designed to give an LLM enough context to propose reorganizations, cleanup strategies, and structural improvements.\n"
    )

    # GLOBAL DATASET PROFILES
    lines.append("## 1. Global Dataset Profiles\n")
    for ds, count in datasets.most_common():
        ds_size = human_size(size_per_dataset[ds])

        lines.append(f"### Dataset: {ds}\n")
        lines.append(f"- Files: **{count}**")
        lines.append(f"- Size: **{ds_size}**")

        # Dominant categories with percentages
        cat_strings = []
        for c, n in cat_per_dataset[ds].most_common(5):
            pct = (n / count) * 100
            cat_strings.append(f"{c} ({n}, {pct:.1f}%)")
        lines.append(f"- Dominant categories: {', '.join(cat_strings)}")
        # Dominant extensions with percentages
        ext_strings = []
        for e, n in ext_per_dataset[ds].most_common(5):
            pct = (n / count) * 100
            label = e or "no_ext"
            ext_strings.append(f"{label} ({n}, {pct:.1f}%)")
        lines.append(f"- Dominant extensions: {', '.join(ext_strings)}")

        if dataset_examples[ds]:
            lines.append("- Example paths:")
            for ex in dataset_examples[ds]:
                lines.append(f"  - `{ex}`")
        lines.append("")

    # CATEGORY INSIGHTS
    lines.append("## 2. Category Insights\n")
    for cat, count in categories.most_common():
        cat_size = human_size(size_per_category[cat])
        lines.append(f"### Category: {cat}\n")
        lines.append(f"- Files: **{count}**")
        lines.append(f"- Total size: **{cat_size}**")

        ds_dist = [
            (ds, cat_per_dataset[ds][cat])
            for ds in datasets
            if cat_per_dataset[ds][cat] > 0
        ]
        ds_dist.sort(key=lambda x: x[1], reverse=True)

        if ds_dist:
            lines.append("- Main datasets for this category:")
            for ds, n in ds_dist[:5]:
                lines.append(f"  - {ds}: {n} files")
        lines.append("")

    # EXTENSION LANDSCAPE
    lines.append("## 3. Extension Landscape\n")
    lines.append("Most common extensions across the entire archive:\n")
    for ext, count in extensions.most_common(40):
        lines.append(f"- `{ext or 'no_ext'}`: {count} files")
    lines.append("")

    # GLOBAL ANOMALIES
    lines.append("## 4. Global Anomalies and Reorganization Hints\n")
    if not anomalies:
        lines.append("- No anomalies detected.\n")
    else:
        for a in anomalies[:200]:
            lines.append(f"- {a}")
        if len(anomalies) > 200:
            lines.append(f"- ...and {len(anomalies) - 200} more.\n")

    # PER-DATASET DETAILED SUMMARIES
    lines.append("# 5. Detailed per-dataset semantic summaries\n")

    for ds, items in sorted(by_dataset.items(), key=lambda x: -len(x[1])):
        count = len(items)
        total_size = sum(r.get("size_bytes", 0) or 0 for r in items)
        avg_size = total_size / count if count else 0

        cats = Counter(r.get("category", "unknown") for r in items)
        exts = Counter((r.get("extension") or "").lower() for r in items)

        # ENTROPY
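        # "Entropy" here is a rough proxy, not Shannon entropy: the number of
        # distinct extensions in the dataset plus how dominant the single most
        # common extension is.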
        unique_exts = len(exts)
        top_ext, top_ext_count = exts.most_common(1)[0]
        dominance_ratio = top_ext_count / count

        if unique_exts > 50:
            entropy_label = "Very high (chaotic/mixed-purpose)"
        elif unique_exts > 30:
            entropy_label = "High (likely mixed content)"
        elif unique_exts > 15:
            entropy_label = "Moderate"
        else:
            entropy_label = "Low (coherent dataset)"

        # TIMESTAMPS
        modified_times = [
            r.get("modified")
            for r in items
            if isinstance(r.get("modified"), (int, float))
        ]
        oldest = ts(min(modified_times)) if modified_times else "unknown"
        newest = ts(max(modified_times)) if modified_times else "unknown"

        # HASH COVERAGE
        hash_count = sum(1 for r in items if r.get("hash_sha1"))
        hash_ratio = hash_count / count

        # EXTENSION DETAILS
        ext_details = defaultdict(
            lambda: {"count": 0, "total_size": 0, "cats": Counter()}
        )
        for r in items:
            ext = (r.get("extension") or "").lower()
            size = r.get("size_bytes", 0) or 0
            cat = r.get("category", "unknown")
            ext_details[ext]["count"] += 1
            ext_details[ext]["total_size"] += size
            ext_details[ext]["cats"][cat] += 1

        # PURPOSE INFERENCE
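        # Map category labels (assumed to come from the upstream classifier,
        # e.g. "faces", "document", "source", "archive") to a coarse,
        # human-readable purpose for the dataset.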
        purpose = []
        if "faces" in cats or "scenery" in cats or "objects" in cats:
            purpose.append("image/photo collection")
        if "document" in cats or "digitized_docs" in cats:
            purpose.append("documents or backups")
        if "girls" in cats or "adult_video" in cats:
            purpose.append("sensitive/private material")
        if "source" in cats:
            purpose.append("source code or development files")
        if "archive" in cats or "iso" in cats:
            purpose.append("long-term archival content")
        if not purpose:
            purpose.append("mixed or unclear")

        # WRITE DATASET SUMMARY
        lines.append(f"## Dataset: {ds}\n")
        lines.append(f"- Files: **{count}**")
        lines.append(f"- Total size: **{human_size(total_size)}**")
        lines.append(f"- Average file size: **{human_size(avg_size)}**")
        lines.append(f"- Unique extensions: **{unique_exts}**")
        lines.append(f"- Extension entropy: **{entropy_label}**")
        lines.append(
            f"- Dominant extension: `{top_ext}` ({top_ext_count} files, {dominance_ratio:.1%})"
        )
        lines.append(
            f"- Dominant categories: {', '.join(f'{c} ({n})' for c, n in cats.most_common(3))}"
        )
        lines.append(f"- Oldest file: {oldest}")
        lines.append(f"- Newest file: {newest}")
        lines.append(f"- Hash coverage: {hash_ratio:.1%}")
        lines.append(f"- Likely purpose: {', '.join(purpose)}\n")

        # TOP 50 EXTENSIONS
        lines.append("### Top 50 Extensions\n")
        lines.append(
            "| Extension | Count | Total Size | Avg Size | Dominant Categories |"
        )
        lines.append(
            "|----------|--------|------------|----------|----------------------|"
        )

        for ext, info in sorted(ext_details.items(), key=lambda x: -x[1]["count"])[:50]:
            count_e = info["count"]
            total_e = info["total_size"]
            avg_e = total_e / count_e if count_e else 0
            dom_cats = ", ".join(f"{c} ({n})" for c, n in info["cats"].most_common(3))

            lines.append(
                f"| `{ext or 'no_ext'}` | {count_e} | {human_size(total_e)} | {human_size(avg_e)} | {dom_cats} |"
            )

        lines.append("\n---\n")

    lines.append("\nEnd of Report\n")
    # WRITE FILE
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.write("\n".join(lines))

    print(f"\n✅ GlacierLens master summary saved to: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()