Files
CryoLens/summarize_chunks4.py
2025-12-26 01:40:13 -05:00

136 lines
5.1 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
from collections import Counter, defaultdict
from datetime import datetime
# Input: JSON inventory of file records (list of dicts) produced by a scanner.
INPUT_FILE = "glacier_files_inventory.json"
# Output: Markdown report with one summary section per dataset.
OUTPUT_FILE = "glacier_dataset_summaries.md"
def human_size(n):
    """Render a byte count as a short human-readable string, e.g. '1.5MB'.

    Divides by 1024 per step through B..TB; anything larger falls
    through to petabytes.
    """
    value = n
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if value < 1024:
            return f"{value:.1f}{unit}"
        value /= 1024
    # Exhausted the unit ladder: report in PB.
    return f"{value:.1f}PB"
def ts(t):
    """Format a Unix timestamp as 'YYYY-MM-DD', or 'unknown' if unusable.

    Uses the local timezone (datetime.fromtimestamp default).

    Fix: the original used a bare `except:`, which also swallows
    KeyboardInterrupt and SystemExit. Narrowed to the exceptions
    fromtimestamp can actually raise for bad input.
    """
    try:
        return datetime.fromtimestamp(t).strftime("%Y-%m-%d")
    except (TypeError, ValueError, OverflowError, OSError):
        return "unknown"
def _find_anomalies(ds, items):
    """Flag records whose category/extension looks misplaced for dataset *ds*.

    Scans at most the first 50,000 records (speed cap, as in the original).
    Returns a list of human-readable anomaly strings.
    """
    # Hoisted: the original recomputed ds.lower() up to four times per
    # record inside a loop over up to 50,000 records.
    ds_lower = ds.lower()
    is_photo = "photo" in ds_lower
    is_media = is_photo or "video" in ds_lower
    is_vault = "vault" in ds_lower
    is_code_ds = "code" in ds_lower

    anomalies = []
    for r in items[:50000]:  # cap for speed
        cat = r.get("category", "")
        path = r.get("path", "")
        ext = (r.get("extension") or "").lower()
        if cat == "tax" and is_photo:
            anomalies.append(f"Tax file in photo dataset: {path}")
        if cat == "source" and is_media:
            anomalies.append(f"Source code in media dataset: {path}")
        if cat == "adult_video" and not is_vault:
            anomalies.append(f"Adult content outside secure dataset: {path}")
        if ext in (".js", ".py") and "node_modules" not in path and not is_code_ds:
            anomalies.append(f"Code file in non-code dataset: {path}")
    return anomalies


def _infer_purpose(cats):
    """Guess a dataset's likely purpose from its category Counter.

    Returns a non-empty list of short purpose strings; falls back to
    'mixed or unclear' when no known category is present.
    """
    purpose = []
    if "faces" in cats or "scenery" in cats or "objects" in cats:
        purpose.append("image/photo collection")
    if "document" in cats or "digitized_docs" in cats:
        purpose.append("documents or backups")
    if "girls" in cats or "adult_video" in cats:
        purpose.append("sensitive/private material")
    if "source" in cats:
        purpose.append("source code or development files")
    if "archive" in cats or "iso" in cats:
        purpose.append("long-term archival content")
    if not purpose:
        purpose.append("mixed or unclear")
    return purpose


def main():
    """Read the file inventory JSON and write per-dataset Markdown summaries.

    Reads INPUT_FILE (a JSON list of file-record dicts), groups records by
    their 'dataset' key, and writes one summary section per dataset to
    OUTPUT_FILE: size stats, extension diversity, dominant categories,
    modification-time range, hash coverage, inferred purpose, anomalies,
    and example paths.
    """
    with open(INPUT_FILE, "r") as f:
        records = json.load(f)

    # Group by dataset
    by_dataset = defaultdict(list)
    for r in records:
        ds = r.get("dataset", "unknown")
        by_dataset[ds].append(r)

    lines = []
    # Fix: restored the hyphens in the headings — the originals read
    # "DatasetFocused" / "highsignal", character-loss mojibake from
    # Unicode dashes being stripped.
    lines.append("# GlacierEdge Dataset-Focused Semantic Summaries\n")
    lines.append("Short, high-signal summaries designed for LLM reasoning about reorganization.\n")

    # Largest datasets first.
    for ds, items in sorted(by_dataset.items(), key=lambda x: -len(x[1])):
        count = len(items)
        total_size = sum(r.get("size_bytes", 0) or 0 for r in items)
        avg_size = total_size / count if count else 0

        # Category / extension frequency counters.
        cats = Counter(r.get("category", "unknown") for r in items)
        exts = Counter((r.get("extension") or "").lower() for r in items)

        # Extension diversity as a rough "entropy" proxy.
        unique_exts = len(exts)
        top_ext, top_ext_count = exts.most_common(1)[0] if exts else ("none", 0)
        dominance_ratio = top_ext_count / count if count else 0
        if unique_exts > 50:
            entropy_label = "Very high (chaotic/mixed-purpose)"
        elif unique_exts > 30:
            entropy_label = "High (likely mixed content)"
        elif unique_exts > 15:
            entropy_label = "Moderate"
        else:
            entropy_label = "Low (coherent dataset)"

        # Modification-time range (numeric timestamps only).
        # Fix: the original also collected "created" timestamps into a
        # variable that was never used; that dead code is removed.
        modified_times = [
            r.get("modified")
            for r in items
            if isinstance(r.get("modified"), (int, float))
        ]
        oldest = ts(min(modified_times)) if modified_times else "unknown"
        newest = ts(max(modified_times)) if modified_times else "unknown"

        # Hash coverage
        hash_count = sum(1 for r in items if r.get("hash_sha1"))
        hash_ratio = hash_count / count if count else 0

        # Example paths (first five in inventory order).
        examples = [r.get("path", "") for r in items[:5]]

        anomalies = _find_anomalies(ds, items)
        purpose = _infer_purpose(cats)

        # Write summary section.
        lines.append(f"## Dataset: {ds}\n")
        lines.append(f"- Files: **{count}**")
        lines.append(f"- Total size: **{human_size(total_size)}**")
        lines.append(f"- Average file size: **{human_size(avg_size)}**")
        lines.append(f"- Unique extensions: **{unique_exts}**")
        lines.append(f"- Extension entropy: **{entropy_label}**")
        lines.append(f"- Dominant extension: `{top_ext}` ({top_ext_count} files, {dominance_ratio:.1%} of dataset)")
        lines.append(f"- Dominant categories: {', '.join(f'{c} ({n})' for c, n in cats.most_common(3))}")
        lines.append(f"- Oldest file: {oldest}")
        lines.append(f"- Newest file: {newest}")
        lines.append(f"- Hash coverage: {hash_ratio:.1%}")
        lines.append(f"- Likely purpose: {', '.join(purpose)}")
        if anomalies:
            lines.append(f"- Anomalies detected: **{len(anomalies)}** (misplaced or suspicious files)")
        else:
            lines.append("- Anomalies detected: none")
        lines.append("- Example paths:")
        for ex in examples:
            lines.append(f" - `{ex}`")
        lines.append("")

    with open(OUTPUT_FILE, "w") as out:
        out.write("\n".join(lines))
    print(f"\n✅ Dataset-focused summaries saved to: {OUTPUT_FILE}")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()