# Files
# CryoLens/files_inventory.py
# 2025-12-26 02:32:34 -05:00
#
# 279 lines
# 5.8 KiB
# Python
# Executable File

# The script scans multiple TrueNAS datasets, collects detailed metadata about every file,
# categorizes them by type, and saves the results to a JSON file (glacier_files_inventory.json).
# This is useful for inventory management, organization, and analysis of files across your storage.
#
import hashlib
import json
import os
from datetime import datetime
# ---------------------------------------------
# CONFIGURATION
# ---------------------------------------------
# Add all your NFS mounted TrueNAS datasets here
# Dataset label -> local NFS mount point. Order matters: datasets are
# scanned (and appear in the output JSON) in this order.
DATASETS = {
    "GlacierBucket1": "/mnt/bucket1",
    "GlacierArchives": "/mnt/archives",
    "GlacierBackups": "/mnt/backups",
    "GlacierVault": "/mnt/vault_bucket",
}

# Destination file for the JSON inventory.
OUTPUT = "glacier_files_inventory.json"

# Set True to SHA-1 every file; left off by default because hashing over
# remote NFS mounts is slow.
ENABLE_HASH = False
# ---------------------------------------------
# HELPERS
# ---------------------------------------------
def sha1(path):
    """Return the SHA-1 hex digest of the file at *path*, or None.

    Returns None immediately when ENABLE_HASH is off, and also when the
    file cannot be read (missing, permission denied, NFS hiccup) so a
    single bad file never aborts the whole scan.
    """
    if not ENABLE_HASH:
        return None
    h = hashlib.sha1()
    try:
        with open(path, "rb") as f:
            # Stream in 8 KiB chunks so large files are never fully loaded.
            for chunk in iter(lambda: f.read(8192), b""):
                h.update(chunk)
    except OSError:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit; only I/O failures should be treated as "no hash".
        return None
    return h.hexdigest()
# Extension groups for categorize(). Hoisted to module level as frozensets so
# they are built once instead of on every call, with O(1) membership tests
# instead of scanning ~100-element lists per file.
_DJI_PHOTO_EXTS = frozenset({".jpg", ".jpeg", ".dng", ".rw2"})
_DJI_VIDEO_EXTS = frozenset({".mp4", ".mov"})
_DJI_SIDECAR_EXTS = frozenset({".srt", ".lrv"})

# Tax / finance software formats.
_TAX_EXTS = frozenset(
    [".tax"]
    # TurboTax (US): .tax2018 .. .tax2026
    + [f".tax{year}" for year in range(2018, 2027)]
    # UFile (Canada): .u12 .. .u26
    + [f".u{n}" for n in range(12, 27)]
    # TaxTron (Canada): .tt18 .. .tt26
    + [f".tt{n}" for n in range(18, 27)]
    # Profile (Canada)
    + [f".pr{n}" for n in range(10)]
    + [".p20", ".p21", ".p22", ".p23"]
    + [".fx0", ".fx1", ".fx2"]
    # Other finance formats
    + [".qif", ".ofx", ".qfx", ".money", ".t1", ".t2"]
)

_VIDEO_EXTS = frozenset({".mp4", ".mkv", ".avi", ".mov"})
_AUDIO_EXTS = frozenset({".mp3", ".wav", ".flac", ".aac"})
_DOCUMENT_EXTS = frozenset({
    ".doc", ".docx", ".dot", ".dotx",
    ".xls", ".xlsx", ".xlsm", ".xlsb", ".xlt", ".xltx",
    ".ppt", ".pptx", ".pptm", ".pot", ".potx",
    ".odt", ".ods", ".odp", ".odg",
    ".pdf", ".rtf", ".txt", ".md", ".csv", ".tsv", ".epub",
})
_IMAGE_EXTS = frozenset({".jpg", ".jpeg", ".png", ".gif", ".raw", ".tiff"})
_ISO_EXTS = frozenset({".iso", ".img"})
# NOTE(review): os.path.splitext() yields ".gz" for "x.tar.gz", so the
# compound entries below are only reachable when a caller passes a compound
# extension itself; kept for backward compatibility.
_ARCHIVE_EXTS = frozenset({
    ".zip", ".tar", ".gz", ".tgz", ".tar.gz", ".bz2", ".tbz", ".tar.bz2",
    ".xz", ".txz", ".tar.xz", ".7z", ".rar", ".lz", ".lzma", ".zst",
})
_SOURCE_EXTS = frozenset({
    ".c", ".h", ".cpp", ".hpp", ".cc",
    ".sh", ".bash", ".zsh",
    ".py", ".js", ".ts", ".go", ".rs", ".java", ".php", ".rb", ".swift",
    ".cs", ".sql",
    ".yaml", ".yml", ".json", ".xml", ".ini", ".cfg", ".toml",
    ".make", ".mk",
})

# Generic categories checked in order after the DJI-specific rules.
_CATEGORY_RULES = (
    ("tax", _TAX_EXTS),
    ("video", _VIDEO_EXTS),
    ("audio", _AUDIO_EXTS),
    ("document", _DOCUMENT_EXTS),
    ("image", _IMAGE_EXTS),
    ("iso", _ISO_EXTS),
    ("archive", _ARCHIVE_EXTS),
    ("source", _SOURCE_EXTS),
)


def categorize(ext, filename):
    """Classify a file into a coarse category string.

    Args:
        ext: File extension including the leading dot, any case.
        filename: Bare file name (no directory), used only to detect DJI
            drone media by its "dji" name prefix (case-insensitive).

    Returns:
        One of "dji_photo", "dji_video", "dji_sidecar", "tax", "video",
        "audio", "document", "image", "iso", "archive", "source", or
        "other" when nothing matches.
    """
    ext = ext.lower()
    # DJI drone media is detected first so e.g. DJI .mp4 files become
    # "dji_video" rather than generic "video".
    if filename.lower().startswith("dji"):
        if ext in _DJI_PHOTO_EXTS:
            return "dji_photo"
        if ext in _DJI_VIDEO_EXTS:
            return "dji_video"
        if ext in _DJI_SIDECAR_EXTS:
            return "dji_sidecar"
        # Other DJI-named files fall through to the generic rules below.
    for category, extensions in _CATEGORY_RULES:
        if ext in extensions:
            return category
    return "other"
# ---------------------------------------------
# MAIN SCAN LOGIC
# ---------------------------------------------
# Walk every configured dataset and collect one metadata record per file.
inventory = []
for dataset_name, root in DATASETS.items():
    # Parenthesize the mount point: the old f-string had no separator, so the
    # message printed as e.g. "GlacierBucket1/mnt/bucket1".
    print(f"Scanning dataset: {dataset_name} ({root})")
    if not os.path.exists(root):
        print(f"⚠ WARNING: Path does not exist: {root}")
        continue
    for dirpath, _, files in os.walk(root):
        for f in files:
            full = os.path.join(dirpath, f)
            # Only os.stat() can realistically fail here (broken symlink,
            # permission denied, NFS drop); keep the try body minimal so
            # unrelated bugs are not silently reported as read errors.
            try:
                stat = os.stat(full)
            except OSError as e:
                print(f"Error reading file {full}: {e}")
                continue
            ext = os.path.splitext(f)[1]
            inventory.append(
                {
                    "dataset": dataset_name,
                    "path": full,
                    "filename": f,
                    "extension": ext,
                    "size_bytes": stat.st_size,
                    "modified": stat.st_mtime,  # epoch seconds
                    "created": stat.st_ctime,   # ctime: inode change time on Unix
                    "hash_sha1": sha1(full),    # None unless ENABLE_HASH
                    "category": categorize(ext, f),
                }
            )
# ---------------------------------------------
# SAVE OUTPUT
# ---------------------------------------------
# Write the inventory as UTF-8 JSON. An explicit encoding avoids relying on
# the platform default (which can fail on non-ASCII filenames), and
# ensure_ascii=False keeps those filenames human-readable in the output.
with open(OUTPUT, "w", encoding="utf-8") as out:
    json.dump(inventory, out, indent=2, ensure_ascii=False)
print(f"\nInventory complete. {len(inventory)} files indexed.")
print(f"Output saved to: {OUTPUT}")