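"""Map every file in the Glacier inventory to a dataset name.

Reads INPUT_FILE (JSON, iterated as a sequence of file records), classifies
each record by its category, extension and path, and writes a JSON object of
path -> dataset name to OUTPUT_FILE.
"""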
import json
|
|
from pathlib import Path
|
|
|
|
# File read by main() and file written by main(), relative to the working
# directory.
INPUT_FILE = "glacier_files_inventory.json"
OUTPUT_FILE = "glacier_file_dataset_mapping.json"

# --- Dataset rules ---------------------------------------------------------

# Dataset name -> set of file extensions assigned to that dataset.
# Extensions are stored lower-case with a leading dot; classify_file()
# lower-cases the record's extension before testing membership.
EXT_GROUPS = {
    "GlacierCode": {
        ".c", ".h", ".java", ".js", ".php", ".py", ".ts",
        ".css", ".html", ".json", ".md", ".xml", ".yml",
    },
    "GlacierPhotos": {".gif", ".jpeg", ".jpg", ".png", ".svg"},
    "GlacierDroneMedia": {".dng", ".mov", ".mp4", ".srt"},
    "GlacierDocuments": {
        ".doc", ".docx", ".pdf", ".pptx", ".txt", ".xlsx",
        ".odp", ".ods", ".odt",
    },
    "GlacierArchives": {
        ".7z", ".gz", ".rar", ".tar", ".tgz", ".zip",
        ".qbb", ".tib",
    },
    "GlacierLegacyDB": {
        ".cdx", ".dbf", ".frm", ".gfr", ".myd", ".myi", ".pof",
    },
    "GlacierSystemImages": {".dll", ".exe", ".iso", ".ova", ".vhd"},
}

# Record category (lower-case) -> dataset name. classify_file() consults this
# table before the extension table.
CATEGORY_GROUPS = {
    "tax": "GlacierSensitive",
    "dji_photo": "GlacierDroneMedia",
    "dji_video": "GlacierDroneMedia",
    "dji_sidecar": "GlacierDroneMedia",
    "document": "GlacierDocuments",
    "image": "GlacierPhotos",
    "source": "GlacierCode",
    "archive": "GlacierArchives",
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def classify_file(record, *, ext_groups=None, category_groups=None):
    """Return the dataset name for one inventory record.

    Rules are tried in order and the first match wins:

    1. ``record["category"]`` is a key of the category table.
    2. ``record["extension"]`` is a member of one of the extension sets.
    3. The substring ``"tax"`` occurs anywhere in ``record["path"]``,
       giving ``"GlacierSensitive"``.
    4. Otherwise ``"GlacierMisc"``.

    All three fields are compared lower-cased. A field that is missing,
    ``None`` (JSON ``null``) or empty is treated as the empty string.

    Args:
        record: Mapping with optional ``"category"``, ``"extension"`` and
            ``"path"`` entries.
        ext_groups: Mapping of dataset name -> collection of lower-case
            extensions. Defaults to the module-level ``EXT_GROUPS``.
        category_groups: Mapping of lower-case category -> dataset name.
            Defaults to the module-level ``CATEGORY_GROUPS``.

    Returns:
        The dataset name as a string.
    """
    # Compare against None rather than truthiness so that an explicitly
    # passed empty table is honoured.
    if ext_groups is None:
        ext_groups = EXT_GROUPS
    if category_groups is None:
        category_groups = CATEGORY_GROUPS

    # ``dict.get(key, "")`` only applies its default when the key is absent;
    # a key present with a null value still yields None, hence the ``or ""``.
    ext = (record.get("extension") or "").lower()
    cat = (record.get("category") or "").lower()

    # 1. Category-based classification (strongest signal)
    if cat in category_groups:
        return category_groups[cat]

    # 2. Extension-based classification
    for dataset, extset in ext_groups.items():
        if ext in extset:
            return dataset

    # 3. Sensitive heuristics
    # NOTE(review): this is only reached for records whose extension is not
    # in any group, so e.g. a PDF under a "tax" folder is classified by its
    # extension instead. The substring test also matches words such as
    # "syntax". Confirm both are intended before relying on this rule.
    if "tax" in (record.get("path") or "").lower():
        return "GlacierSensitive"

    # 4. Fallback
    return "GlacierMisc"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def main(input_file=None, output_file=None):
    """Classify every inventory record and write the path -> dataset mapping.

    Loads the JSON inventory, runs ``classify_file`` on each record, and
    writes a JSON object keyed by each record's ``"path"`` with the dataset
    name as value. If two records share a path, the later one wins.

    Args:
        input_file: Inventory JSON to read. Defaults to ``INPUT_FILE``.
        output_file: Mapping JSON to write. Defaults to ``OUTPUT_FILE``.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the inventory is not valid JSON.
    """
    if input_file is None:
        input_file = INPUT_FILE
    if output_file is None:
        output_file = OUTPUT_FILE

    # JSON text is UTF-8; name the encoding so that non-ASCII paths do not
    # depend on the platform's locale default.
    with open(input_file, "r", encoding="utf-8") as f:
        records = json.load(f)

    mapping = {}
    skipped = 0

    for record in records:
        path = record.get("path")
        if path is None:
            # A None key would be serialized as the literal key "null", and
            # every path-less record would overwrite the previous one.
            skipped += 1
            continue
        mapping[path] = classify_file(record)

    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(mapping, out, indent=2)

    if skipped:
        print(f"Skipped {skipped} record(s) without a path")
    print(f"Mapping written to {output_file}")
|
|
|
|
|
|
# Script entry point: build the mapping only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    main()
|