CryoLens/summarize_files_8.py

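"""Summarize a GlacierLens file inventory into a Markdown report.

Reads the JSON inventory (INPUT_FILE), aggregates global and per-dataset
statistics (sizes, categories, extensions, root folders, entropy labels,
timestamps, hash coverage, simple anomaly checks) and writes the result as
Markdown to OUTPUT_FILE.

Run it from the directory that contains the inventory file:

    python summarize_files_8.py
"""
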
import json
from collections import Counter, defaultdict
from datetime import datetime
INPUT_FILE = "glacier_files_inventory.json"
OUTPUT_FILE = "glacierlens_master_summary.md"
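# Both paths above are resolved relative to the current working directory.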


def human_size(n):
    """Format a byte count as a human-readable string."""
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if n < 1024:
            return f"{n:.1f}{unit}"
        n /= 1024
    return f"{n:.1f}PB"


def ts(t):
    """Format an epoch timestamp as YYYY-MM-DD, or 'unknown' if invalid."""
    try:
        return datetime.fromtimestamp(t).strftime("%Y-%m-%d")
    except (TypeError, ValueError, OSError, OverflowError):
        return "unknown"


def extract_root_folder(path):
    """
    Extract the first folder after the dataset mount point.
    Example: /mnt/bucket1/SA/... -> SA
    """
    if not path:
        return "unknown"
    parts = [p for p in path.split("/") if p]
    # Expecting: ["mnt", "bucket1", "SA", ...]
    if len(parts) >= 3:
        return parts[2]
    return "unknown"


def folder_entropy_score(unique_folders):
    """
    Simple entropy scoring for folder diversity.
    """
    if unique_folders > 200:
        return "Very high (chaotic folder structure)"
    elif unique_folders > 100:
        return "High (mixed folder structure)"
    elif unique_folders > 40:
        return "Moderate"
    else:
        return "Low (coherent folder structure)"


def main():
    with open(INPUT_FILE, "r") as f:
        records = json.load(f)

    # GLOBAL COUNTERS
    datasets = Counter()
    categories = Counter()
    extensions = Counter()
    size_per_dataset = Counter()
    size_per_category = Counter()
    ext_per_dataset = defaultdict(Counter)
    cat_per_dataset = defaultdict(Counter)
    dataset_examples = defaultdict(list)
    root_folders_per_dataset = defaultdict(Counter)
    anomalies = []
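
    # Each inventory record is expected to carry the keys read below:
    # "dataset", "category", "extension", "size_bytes", "path",
    # "modified" (epoch seconds) and, optionally, "hash_sha1".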
    # PER-DATASET GROUPING
    by_dataset = defaultdict(list)
    for r in records:
        ds = r.get("dataset", "unknown")
        cat = r.get("category", "unknown")
        ext = (r.get("extension") or "").lower()
        size = r.get("size_bytes", 0) or 0
        path = r.get("path", "")

        datasets[ds] += 1
        categories[cat] += 1
        extensions[ext] += 1
        size_per_dataset[ds] += size
        size_per_category[cat] += size
        ext_per_dataset[ds][ext] += 1
        cat_per_dataset[ds][cat] += 1
        by_dataset[ds].append(r)

        # Root folder extraction
        root = extract_root_folder(path)
        root_folders_per_dataset[ds][root] += 1

        if len(dataset_examples[ds]) < 5:
            dataset_examples[ds].append(path)

        # GLOBAL ANOMALIES
        if cat == "tax" and "video" in ds.lower():
            anomalies.append(f"Tax file in video dataset: {path}")
        if cat == "source" and ("photo" in ds.lower() or "video" in ds.lower()):
            anomalies.append(f"Source code in media dataset: {path}")
        if cat == "adult_video" and "vault" not in ds.lower():
            anomalies.append(f"Adult content outside secure dataset: {path}")
        if cat == "archive" and size > 5 * 1024 * 1024 * 1024:
            anomalies.append(f"Very large archive (>5GB): {path} ({human_size(size)})")
    # BEGIN OUTPUT
    lines = []
    total_files = len(records)
    total_size = sum(r.get("size_bytes", 0) or 0 for r in records)

    lines.append("# GlacierLens Master Semantic Summary\n")
    lines.append(f"- **Total files:** {total_files}")
    lines.append(f"- **Total size:** {human_size(total_size)}\n")

    # HIGH-LEVEL OVERVIEW
    lines.append("## High-level overview\n")
    lines.append(
        "This summary provides a global and per-dataset semantic analysis of your GlacierEdge archive. "
        "It includes dataset profiles, category insights, extension landscapes, entropy scoring, folder structure analysis, "
        "and anomaly detection.\n"
    )
    # GLOBAL DATASET PROFILES
    lines.append("## 1. Global Dataset Profiles\n")
    for ds, count in datasets.most_common():
        ds_size = human_size(size_per_dataset[ds])
        top_cats = ", ".join(
            f"{c} ({n}, {(n / count) * 100:.1f}%)"
            for c, n in cat_per_dataset[ds].most_common(5)
        )
        top_exts = ", ".join(
            f"{e or 'no_ext'} ({n}, {(n / count) * 100:.1f}%)"
            for e, n in ext_per_dataset[ds].most_common(5)
        )

        lines.append(f"### Dataset: {ds}\n")
        lines.append(f"- Files: **{count}**")
        lines.append(f"- Size: **{ds_size}**")
        lines.append(f"- Dominant categories: {top_cats}")
        lines.append(f"- Dominant extensions: {top_exts}")

        # Root folder summary
        top_roots = root_folders_per_dataset[ds].most_common(10)
        if top_roots:
            lines.append("- Top root folders:")
            for folder, n in top_roots:
                pct = (n / count) * 100
                lines.append(f" - `{folder}`: {n} files ({pct:.1f}%)")

        if dataset_examples[ds]:
            lines.append("- Example paths:")
            for ex in dataset_examples[ds]:
                lines.append(f" - `{ex}`")

        lines.append("")
    # CATEGORY INSIGHTS
    lines.append("## 2. Category Insights\n")
    for cat, count in categories.most_common():
        cat_size = human_size(size_per_category[cat])
        lines.append(f"### Category: {cat}\n")
        lines.append(f"- Files: **{count}**")
        lines.append(f"- Total size: **{cat_size}**")

        ds_dist = [
            (ds, cat_per_dataset[ds][cat])
            for ds in datasets
            if cat_per_dataset[ds][cat] > 0
        ]
        ds_dist.sort(key=lambda x: x[1], reverse=True)

        if ds_dist:
            lines.append("- Main datasets for this category:")
            for ds, n in ds_dist[:5]:
                lines.append(f" - {ds}: {n} files")

        lines.append("")
    # EXTENSION LANDSCAPE
    lines.append("## 3. Extension Landscape\n")
    lines.append("Most common extensions across the entire archive:\n")
    for ext, count in extensions.most_common(40):
        lines.append(f"- `{ext or 'no_ext'}`: {count} files")
    lines.append("")
    # GLOBAL ANOMALIES
    lines.append("## 4. Global Anomalies and Reorganization Hints\n")
    if not anomalies:
        lines.append("- No anomalies detected.\n")
    else:
        for a in anomalies[:200]:
            lines.append(f"- {a}")
        if len(anomalies) > 200:
            lines.append(f"- ...and {len(anomalies) - 200} more.")
        lines.append("")
    # PER-DATASET DETAILED SUMMARIES
    lines.append("# 5. Detailed per-dataset semantic summaries\n")
    for ds, items in sorted(by_dataset.items(), key=lambda x: -len(x[1])):
        count = len(items)
        total_size = sum(r.get("size_bytes", 0) or 0 for r in items)
        avg_size = total_size / count if count else 0
        cats = Counter(r.get("category", "unknown") for r in items)
        exts = Counter((r.get("extension") or "").lower() for r in items)
        roots = root_folders_per_dataset[ds]

        # ENTROPY
        unique_exts = len(exts)
        unique_roots = len(roots)
        top_ext, top_ext_count = exts.most_common(1)[0]
        dominance_ratio = top_ext_count / count
        if unique_exts > 50:
            entropy_label = "Very high (chaotic/mixed-purpose)"
        elif unique_exts > 30:
            entropy_label = "High (likely mixed content)"
        elif unique_exts > 15:
            entropy_label = "Moderate"
        else:
            entropy_label = "Low (coherent dataset)"
        folder_entropy = folder_entropy_score(unique_roots)
        # TIMESTAMPS
        modified_times = [
            r.get("modified")
            for r in items
            if isinstance(r.get("modified"), (int, float))
        ]
        oldest = ts(min(modified_times)) if modified_times else "unknown"
        newest = ts(max(modified_times)) if modified_times else "unknown"

        # HASH COVERAGE
        hash_count = sum(1 for r in items if r.get("hash_sha1"))
        hash_ratio = hash_count / count

        # EXTENSION DETAILS
        ext_details = defaultdict(
            lambda: {"count": 0, "total_size": 0, "cats": Counter()}
        )
        for r in items:
            ext = (r.get("extension") or "").lower()
            size = r.get("size_bytes", 0) or 0
            cat = r.get("category", "unknown")
            ext_details[ext]["count"] += 1
            ext_details[ext]["total_size"] += size
            ext_details[ext]["cats"][cat] += 1
        # PURPOSE INFERENCE
        purpose = []
        if "faces" in cats or "scenery" in cats or "objects" in cats:
            purpose.append("image/photo collection")
        if "document" in cats or "digitized_docs" in cats:
            purpose.append("documents or backups")
        if "girls" in cats or "adult_video" in cats:
            purpose.append("sensitive/private material")
        if "source" in cats:
            purpose.append("source code or development files")
        if "archive" in cats or "iso" in cats:
            purpose.append("long-term archival content")
        if not purpose:
            purpose.append("mixed or unclear")
        # WRITE DATASET SUMMARY
        lines.append(f"## Dataset: {ds}\n")
        lines.append(f"- Files: **{count}**")
        lines.append(f"- Total size: **{human_size(total_size)}**")
        lines.append(f"- Average file size: **{human_size(avg_size)}**")
        lines.append(f"- Unique extensions: **{unique_exts}**")
        lines.append(f"- Extension entropy: **{entropy_label}**")
        lines.append(f"- Unique root folders: **{unique_roots}**")
        lines.append(f"- Folder entropy: **{folder_entropy}**")
        lines.append(
            f"- Dominant extension: `{top_ext}` ({top_ext_count} files, {dominance_ratio:.1%})"
        )
        lines.append(
            f"- Dominant categories: {', '.join(f'{c} ({n})' for c, n in cats.most_common(3))}"
        )
        lines.append(f"- Oldest file: {oldest}")
        lines.append(f"- Newest file: {newest}")
        lines.append(f"- Hash coverage: {hash_ratio:.1%}")
        lines.append(f"- Likely purpose: {', '.join(purpose)}\n")

        # ROOT FOLDER BREAKDOWN
        lines.append("### Top Root Folders\n")
        lines.append("| Folder | Count | Percent |")
        lines.append("|--------|--------|----------|")
        for folder, n in roots.most_common(20):
            pct = (n / count) * 100
            lines.append(f"| `{folder}` | {n} | {pct:.1f}% |")
        lines.append("")
        # TOP 50 EXTENSIONS
        lines.append("### Top 50 Extensions\n")
        lines.append(
            "| Extension | Count | Total Size | Avg Size | Dominant Categories |"
        )
        lines.append(
            "|----------|--------|------------|----------|----------------------|"
        )
        for ext, info in sorted(ext_details.items(), key=lambda x: -x[1]["count"])[:50]:
            count_e = info["count"]
            total_e = info["total_size"]
            avg_e = total_e / count_e if count_e else 0
            dom_cats = ", ".join(f"{c} ({n})" for c, n in info["cats"].most_common(3))
            lines.append(
                f"| `{ext or 'no_ext'}` | {count_e} | {human_size(total_e)} | {human_size(avg_e)} | {dom_cats} |"
            )

        lines.append("\n---\n")

    # WRITE FILE
    with open(OUTPUT_FILE, "w") as out:
        out.write("\n".join(lines))

    print(f"\n✅ GlacierLens master summary saved to: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()