Files
CryoLens/file_to_dataser_mapping.py
2025-12-26 03:29:35 -05:00

100 lines
2.3 KiB
Python

import json
from pathlib import Path
# Inventory file that main() opens and parses as JSON.
INPUT_FILE = "glacier_files_inventory.json"
# File that main() writes the path -> dataset mapping to, as JSON.
OUTPUT_FILE = "glacier_file_dataset_mapping.json"
# --- Dataset rules ---------------------------------------------------------
# Dataset name -> set of file extensions routed to that dataset.
# Every extension is lowercase and carries its leading dot; classify_file()
# lowercases the record's extension before testing membership.
EXT_GROUPS = {
    "GlacierCode": {
        ".js", ".ts", ".java", ".php", ".py", ".c", ".h",
        ".css", ".html", ".xml",
        ".json", ".yml", ".md",
    },
    "GlacierPhotos": {".jpg", ".jpeg", ".png", ".gif", ".svg"},
    "GlacierDroneMedia": {".dng", ".mp4", ".mov", ".srt"},
    "GlacierDocuments": {
        ".pdf", ".docx", ".doc", ".xlsx", ".pptx", ".txt",
        ".odt", ".ods", ".odp",
    },
    "GlacierArchives": {
        ".zip", ".7z", ".gz", ".tar", ".tgz", ".rar", ".qbb", ".tib",
    },
    "GlacierLegacyDB": {
        ".frm", ".myd", ".myi", ".dbf", ".gfr", ".pof", ".cdx",
    },
    "GlacierSystemImages": {".iso", ".vhd", ".ova", ".exe", ".dll"},
}
# Record category (lowercase) -> dataset name. classify_file() consults this
# table before it looks at the file extension.
CATEGORY_GROUPS = {
    "tax": "GlacierSensitive",
    # All three DJI categories share one dataset.
    "dji_photo": "GlacierDroneMedia",
    "dji_video": "GlacierDroneMedia",
    "dji_sidecar": "GlacierDroneMedia",
    "document": "GlacierDocuments",
    "image": "GlacierPhotos",
    "source": "GlacierCode",
    "archive": "GlacierArchives",
}
# ---------------------------------------------------------------------------
def classify_file(record, *, category_groups=None, ext_groups=None):
    """Return the name of the dataset that *record* belongs to.

    Rules are tried in order and the first match wins:

    1. ``record["category"]`` looked up in the category table.
    2. ``record["extension"]`` tested against each extension set.
    3. The substring ``"tax"`` anywhere in ``record["path"]``.
    4. Fallback to ``"GlacierMisc"``.

    All three fields are lowercased before comparison. A field that is
    missing or ``None`` is treated as an empty string.

    Args:
        record: Mapping describing one file. The keys ``"category"``,
            ``"extension"`` and ``"path"`` are read; all are optional.
        category_groups: Optional category -> dataset mapping. Defaults to
            the module-level ``CATEGORY_GROUPS``.
        ext_groups: Optional dataset -> extension-set mapping. Defaults to
            the module-level ``EXT_GROUPS``.

    Returns:
        The dataset name as a string.
    """
    if category_groups is None:
        category_groups = CATEGORY_GROUPS
    if ext_groups is None:
        ext_groups = EXT_GROUPS

    # ``or ""`` covers both a missing key and an explicit null value;
    # ``dict.get(key, "")`` alone would hand None to ``.lower()``.
    ext = (record.get("extension") or "").lower()
    cat = (record.get("category") or "").lower()
    path = (record.get("path") or "").lower()

    # 1. Category-based classification (strongest signal)
    if cat in category_groups:
        return category_groups[cat]

    # 2. Extension-based classification
    for dataset, extset in ext_groups.items():
        if ext in extset:
            return dataset

    # 3. Sensitive heuristics
    # NOTE(review): this is only reached when rules 1 and 2 both miss, and
    # it is a plain substring test, so a path containing "syntax" matches
    # too. Confirm that both are intended.
    if "tax" in path:
        return "GlacierSensitive"

    # 4. Fallback
    return "GlacierMisc"
# ---------------------------------------------------------------------------
def main():
    """Classify every inventory record and write the path -> dataset map.

    Loads the JSON document in ``INPUT_FILE`` and iterates over it. Each
    record is classified with ``classify_file`` and stored under its
    ``"path"`` value. The mapping is then written to ``OUTPUT_FILE`` as JSON
    with two-space indentation. If the same path occurs more than once, the
    last record wins.
    """
    # Pin the encoding: without it open() falls back to a locale-dependent
    # default, which can mis-decode non-ASCII paths on some platforms.
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        records = json.load(f)

    mapping = {}
    skipped = 0
    for r in records:
        path = r.get("path")
        if path is None:
            # json.dump would serialise a None key as the string "null",
            # merging every path-less record into one bogus entry.
            skipped += 1
            continue
        mapping[path] = classify_file(r)

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        json.dump(mapping, out, indent=2)

    print(f"Mapping written to {OUTPUT_FILE}")
    if skipped:
        print(f"Skipped {skipped} record(s) without a path")
# Run the mapping job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()