CryoLens/summarize_files_7.py
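"""Summarize a GlacierEdge file inventory into a single markdown report.

Reads the JSON inventory in INPUT_FILE, aggregates global and per-dataset
statistics (dataset profiles, category insights, the extension landscape,
extension "entropy", and anomaly hints), and writes the result to
OUTPUT_FILE as the GlacierLens master semantic summary.
"""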
import json
from collections import Counter, defaultdict
from datetime import datetime
INPUT_FILE = "glacier_files_inventory.json"
OUTPUT_FILE = "glacier_lens_master_summary.md"
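# Each inventory record is expected to provide at least the fields read
# below: "path", "dataset", "category", "extension", "size_bytes",
# "modified" (Unix timestamp) and, optionally, "hash_sha1". A hypothetical
# record, purely for illustration:
#   {"path": "photos/2019/IMG_0001.jpg", "dataset": "photos_2019",
#    "category": "scenery", "extension": "jpg", "size_bytes": 2457600,
#    "modified": 1577836800, "hash_sha1": "3f786850e3..."}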
def human_size(n):
for unit in ["B", "KB", "MB", "GB", "TB"]:
if n < 1024:
return f"{n:.1f}{unit}"
n /= 1024
return f"{n:.1f}PB"
def ts(t):
try:
return datetime.fromtimestamp(t).strftime("%Y-%m-%d")
    except (OverflowError, OSError, ValueError):
return "unknown"
def main():
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
records = json.load(f)
print(f"\n✅ Reading the files inventory from: {INPUT_FILE}")
# GLOBAL COUNTERS
datasets = Counter()
categories = Counter()
extensions = Counter()
size_per_dataset = Counter()
size_per_category = Counter()
ext_per_dataset = defaultdict(Counter)
cat_per_dataset = defaultdict(Counter)
dataset_examples = defaultdict(list)
anomalies = []
# PER-DATASET GROUPING
by_dataset = defaultdict(list)
for r in records:
ds = r.get("dataset", "unknown")
cat = r.get("category", "unknown")
ext = (r.get("extension") or "").lower()
size = r.get("size_bytes", 0) or 0
path = r.get("path", "")
datasets[ds] += 1
categories[cat] += 1
extensions[ext] += 1
size_per_dataset[ds] += size
size_per_category[cat] += size
ext_per_dataset[ds][ext] += 1
cat_per_dataset[ds][cat] += 1
by_dataset[ds].append(r)
if len(dataset_examples[ds]) < 5:
dataset_examples[ds].append(path)
# GLOBAL ANOMALIES
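        # These heuristics key off substrings of the dataset name ("video",
        # "photo", "vault"); they assume datasets are named after their
        # content and will need adjusting for other naming schemes.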
if cat == "tax" and "video" in ds.lower():
anomalies.append(f"Tax file in video dataset: {path}")
if cat == "source" and ("photo" in ds.lower() or "video" in ds.lower()):
anomalies.append(f"Source code in media dataset: {path}")
if cat == "adult_video" and "vault" not in ds.lower():
anomalies.append(f"Adult content outside secure dataset: {path}")
if cat == "archive" and size > 5 * 1024 * 1024 * 1024:
anomalies.append(f"Very large archive (>5GB): {path} ({human_size(size)})")
# BEGIN OUTPUT
lines = []
total_files = len(records)
total_size = sum(r.get("size_bytes", 0) or 0 for r in records)
lines.append("# GlacierLens Master Semantic Summary\n")
lines.append(f"- **Total files:** {total_files}")
lines.append(f"- **Total size:** {human_size(total_size)}\n")
# HIGH-LEVEL OVERVIEW
lines.append("## High-level overview\n")
lines.append(
"This summary provides a global and per-dataset semantic analysis of your GlacierEdge archive. "
"It includes dataset profiles, category insights, extension landscapes, entropy scoring, and anomaly detection. "
"It is designed to give an LLM enough context to propose reorganizations, cleanup strategies, and structural improvements.\n"
)
# GLOBAL DATASET PROFILES
lines.append("## 1. Global Dataset Profiles\n")
for ds, count in datasets.most_common():
ds_size = human_size(size_per_dataset[ds])
top_cats = ", ".join(
f"{c} ({n})" for c, n in cat_per_dataset[ds].most_common(5)
)
top_exts = ", ".join(
f"{e or 'no_ext'} ({n})" for e, n in ext_per_dataset[ds].most_common(5)
)
lines.append(f"### Dataset: {ds}\n")
lines.append(f"- Files: **{count}**")
lines.append(f"- Size: **{ds_size}**")
# lines.append(f"- Dominant categories: {top_cats}")
# lines.append(f"- Dominant extensions: {top_exts}")
# Dominant categories with percentages
cat_strings = []
for c, n in cat_per_dataset[ds].most_common(5):
pct = (n / count) * 100
cat_strings.append(f"{c} ({n}, {pct:.1f}%)")
lines.append(f"- Dominant categories: {', '.join(cat_strings)}")
# Dominant extensions with percentages
ext_strings = []
for e, n in ext_per_dataset[ds].most_common(5):
pct = (n / count) * 100
label = e or "no_ext"
ext_strings.append(f"{label} ({n}, {pct:.1f}%)")
lines.append(f"- Dominant extensions: {', '.join(ext_strings)}")
if dataset_examples[ds]:
lines.append("- Example paths:")
for ex in dataset_examples[ds]:
lines.append(f" - `{ex}`")
lines.append("")
# CATEGORY INSIGHTS
lines.append("## 2. Categories Insights\n")
for cat, count in categories.most_common():
cat_size = human_size(size_per_category[cat])
lines.append(f"### Category: {cat}\n")
lines.append(f"- Files: **{count}**")
lines.append(f"- Total size: **{cat_size}**")
ds_dist = [
(ds, cat_per_dataset[ds][cat])
for ds in datasets
if cat_per_dataset[ds][cat] > 0
]
ds_dist.sort(key=lambda x: x[1], reverse=True)
if ds_dist:
lines.append("- Main datasets for this category:")
for ds, n in ds_dist[:5]:
lines.append(f" - {ds}: {n} files")
lines.append("")
# EXTENSION LANDSCAPE
lines.append("## 3. Extensions Landscape\n")
lines.append("Most common extensions across the entire archive:\n")
for ext, count in extensions.most_common(40):
lines.append(f"- `{ext or 'no_ext'}`: {count} files")
lines.append("")
# GLOBAL ANOMALIES
lines.append("## 4. Global Anomalies and Reorganization Hints\n")
if not anomalies:
lines.append("- No anomalies detected.\n")
else:
for a in anomalies[:200]:
lines.append(f"- {a}")
if len(anomalies) > 200:
lines.append(f"- ...and {len(anomalies) - 200} more.\n")
# PER-DATASET DETAILED SUMMARIES
lines.append("# 5. Detailed per-dataset semantic summaries\n")
for ds, items in sorted(by_dataset.items(), key=lambda x: -len(x[1])):
count = len(items)
total_size = sum(r.get("size_bytes", 0) or 0 for r in items)
avg_size = total_size / count if count else 0
cats = Counter(r.get("category", "unknown") for r in items)
exts = Counter((r.get("extension") or "").lower() for r in items)
# ENTROPY
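        # "Entropy" here is simply the number of distinct extensions in the
        # dataset, not Shannon entropy; the thresholds below are heuristic.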
unique_exts = len(exts)
top_ext, top_ext_count = exts.most_common(1)[0]
dominance_ratio = top_ext_count / count
if unique_exts > 50:
entropy_label = "Very high (chaotic/mixed-purpose)"
elif unique_exts > 30:
entropy_label = "High (likely mixed content)"
elif unique_exts > 15:
entropy_label = "Moderate"
else:
entropy_label = "Low (coherent dataset)"
# TIMESTAMPS
modified_times = [
r.get("modified")
for r in items
if isinstance(r.get("modified"), (int, float))
]
oldest = ts(min(modified_times)) if modified_times else "unknown"
newest = ts(max(modified_times)) if modified_times else "unknown"
# HASH COVERAGE
hash_count = sum(1 for r in items if r.get("hash_sha1"))
hash_ratio = hash_count / count
# EXTENSION DETAILS
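        # ext_details maps extension -> {"count", "total_size", "cats"} so the
        # per-dataset extension table below can report size and category mix.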
ext_details = defaultdict(
lambda: {"count": 0, "total_size": 0, "cats": Counter()}
)
for r in items:
ext = (r.get("extension") or "").lower()
size = r.get("size_bytes", 0) or 0
cat = r.get("category", "unknown")
ext_details[ext]["count"] += 1
ext_details[ext]["total_size"] += size
ext_details[ext]["cats"][cat] += 1
# PURPOSE INFERENCE
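        # The category names tested below ("faces", "digitized_docs", ...) are
        # assumed to match whatever classifier produced the inventory; anything
        # unrecognized falls back to "mixed or unclear".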
purpose = []
if "faces" in cats or "scenery" in cats or "objects" in cats:
purpose.append("image/photo collection")
if "document" in cats or "digitized_docs" in cats:
purpose.append("documents or backups")
if "girls" in cats or "adult_video" in cats:
purpose.append("sensitive/private material")
if "source" in cats:
purpose.append("source code or development files")
if "archive" in cats or "iso" in cats:
purpose.append("long-term archival content")
if not purpose:
purpose.append("mixed or unclear")
# WRITE DATASET SUMMARY
lines.append(f"## Dataset: {ds}\n")
lines.append(f"- Files: **{count}**")
lines.append(f"- Total size: **{human_size(total_size)}**")
lines.append(f"- Average file size: **{human_size(avg_size)}**")
lines.append(f"- Unique extensions: **{unique_exts}**")
lines.append(f"- Extension entropy: **{entropy_label}**")
lines.append(
f"- Dominant extension: `{top_ext}` ({top_ext_count} files, {dominance_ratio:.1%})"
)
lines.append(
f"- Dominant categories: {', '.join(f'{c} ({n})' for c, n in cats.most_common(3))}"
)
lines.append(f"- Oldest file: {oldest}")
lines.append(f"- Newest file: {newest}")
lines.append(f"- Hash coverage: {hash_ratio:.1%}")
lines.append(f"- Likely purpose: {', '.join(purpose)}\n")
# TOP 50 EXTENSIONS
lines.append("### Top 50 Extensions\n")
lines.append(
"| Extension | Count | Total Size | Avg Size | Dominant Categories |"
)
lines.append(
"|----------|--------|------------|----------|----------------------|"
)
for ext, info in sorted(ext_details.items(), key=lambda x: -x[1]["count"])[:50]:
count_e = info["count"]
total_e = info["total_size"]
avg_e = total_e / count_e if count_e else 0
dom_cats = ", ".join(f"{c} ({n})" for c, n in info["cats"].most_common(3))
lines.append(
f"| `{ext or 'no_ext'}` | {count_e} | {human_size(total_e)} | {human_size(avg_e)} | {dom_cats} |"
)
lines.append("\n---\n")
lines.append("\n End of Report \n")
# WRITE FILE
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
out.write("\n".join(lines))
print(f"\n✅ GlacierLens master summary saved to: {OUTPUT_FILE}")
if __name__ == "__main__":
main()
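# Usage (run from the directory that holds glacier_files_inventory.json):
#   python summarize_files_7.py
# The report is written alongside it as glacier_lens_master_summary.md.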