# Files
# CryoLens/files_inventory.py
# 2025-12-26 02:32:34 -05:00
#
# 279 lines
# 5.8 KiB
# Python
# Executable File

# The script scans multiple TrueNAS datasets, collects detailed metadata about every file,
# categorizes them by type, and saves the results to a JSON file (glacier_files_inventory.json).
# This is useful for inventory management, organization, and analysis of files across your storage.
#
import hashlib
import json
import os
from datetime import datetime
# ---------------------------------------------
# CONFIGURATION
# ---------------------------------------------
# Add all your NFS mounted TrueNAS datasets here
# Dataset label -> local NFS mount point. Order matters: datasets are
# scanned (and appear in the output JSON) in this order.
DATASETS = {
    "GlacierBucket1": "/mnt/bucket1",
    "GlacierArchives": "/mnt/archives",
    "GlacierBackups": "/mnt/backups",
    "GlacierVault": "/mnt/vault_bucket",
}

# Destination file for the JSON inventory.
OUTPUT = "glacier_files_inventory.json"

# Set True to SHA-1 every file; left off by default because hashing over
# remote NFS mounts is slow.
ENABLE_HASH = False
# ---------------------------------------------
# HELPERS
# ---------------------------------------------
def sha1(path):
    """Return the SHA-1 hex digest of the file at *path*, or None.

    Returns None immediately when ENABLE_HASH is off, and also when the
    file cannot be read (missing, permission denied, NFS hiccup) so a
    single bad file never aborts the whole scan.
    """
    if not ENABLE_HASH:
        return None
    h = hashlib.sha1()
    try:
        with open(path, "rb") as f:
            # Stream in 8 KiB chunks so large files are never fully loaded.
            for chunk in iter(lambda: f.read(8192), b""):
                h.update(chunk)
    except OSError:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit; only I/O failures should be treated as "no hash".
        return None
    return h.hexdigest()
# Extension groups for categorize(). Hoisted to module level as frozensets so
# they are built once instead of on every call, with O(1) membership tests
# instead of scanning ~100-element lists per file.
_DJI_PHOTO_EXTS = frozenset({".jpg", ".jpeg", ".dng", ".rw2"})
_DJI_VIDEO_EXTS = frozenset({".mp4", ".mov"})
_DJI_SIDECAR_EXTS = frozenset({".srt", ".lrv"})

# Tax / finance software formats.
_TAX_EXTS = frozenset(
    [".tax"]
    # TurboTax (US): .tax2018 .. .tax2026
    + [f".tax{year}" for year in range(2018, 2027)]
    # UFile (Canada): .u12 .. .u26
    + [f".u{n}" for n in range(12, 27)]
    # TaxTron (Canada): .tt18 .. .tt26
    + [f".tt{n}" for n in range(18, 27)]
    # Profile (Canada)
    + [f".pr{n}" for n in range(10)]
    + [".p20", ".p21", ".p22", ".p23"]
    + [".fx0", ".fx1", ".fx2"]
    # Other finance formats
    + [".qif", ".ofx", ".qfx", ".money", ".t1", ".t2"]
)

_VIDEO_EXTS = frozenset({".mp4", ".mkv", ".avi", ".mov"})
_AUDIO_EXTS = frozenset({".mp3", ".wav", ".flac", ".aac"})
_DOCUMENT_EXTS = frozenset({
    ".doc", ".docx", ".dot", ".dotx",
    ".xls", ".xlsx", ".xlsm", ".xlsb", ".xlt", ".xltx",
    ".ppt", ".pptx", ".pptm", ".pot", ".potx",
    ".odt", ".ods", ".odp", ".odg",
    ".pdf", ".rtf", ".txt", ".md", ".csv", ".tsv", ".epub",
})
_IMAGE_EXTS = frozenset({".jpg", ".jpeg", ".png", ".gif", ".raw", ".tiff"})
_ISO_EXTS = frozenset({".iso", ".img"})
# NOTE(review): os.path.splitext() yields ".gz" for "x.tar.gz", so the
# compound entries below are only reachable when a caller passes a compound
# extension itself; kept for backward compatibility.
_ARCHIVE_EXTS = frozenset({
    ".zip", ".tar", ".gz", ".tgz", ".tar.gz", ".bz2", ".tbz", ".tar.bz2",
    ".xz", ".txz", ".tar.xz", ".7z", ".rar", ".lz", ".lzma", ".zst",
})
_SOURCE_EXTS = frozenset({
    ".c", ".h", ".cpp", ".hpp", ".cc",
    ".sh", ".bash", ".zsh",
    ".py", ".js", ".ts", ".go", ".rs", ".java", ".php", ".rb", ".swift",
    ".cs", ".sql",
    ".yaml", ".yml", ".json", ".xml", ".ini", ".cfg", ".toml",
    ".make", ".mk",
})

# Generic categories checked in order after the DJI-specific rules.
_CATEGORY_RULES = (
    ("tax", _TAX_EXTS),
    ("video", _VIDEO_EXTS),
    ("audio", _AUDIO_EXTS),
    ("document", _DOCUMENT_EXTS),
    ("image", _IMAGE_EXTS),
    ("iso", _ISO_EXTS),
    ("archive", _ARCHIVE_EXTS),
    ("source", _SOURCE_EXTS),
)


def categorize(ext, filename):
    """Classify a file into a coarse category string.

    Args:
        ext: File extension including the leading dot, any case.
        filename: Bare file name (no directory), used only to detect DJI
            drone media by its "dji" name prefix (case-insensitive).

    Returns:
        One of "dji_photo", "dji_video", "dji_sidecar", "tax", "video",
        "audio", "document", "image", "iso", "archive", "source", or
        "other" when nothing matches.
    """
    ext = ext.lower()
    # DJI drone media is detected first so e.g. DJI .mp4 files become
    # "dji_video" rather than generic "video".
    if filename.lower().startswith("dji"):
        if ext in _DJI_PHOTO_EXTS:
            return "dji_photo"
        if ext in _DJI_VIDEO_EXTS:
            return "dji_video"
        if ext in _DJI_SIDECAR_EXTS:
            return "dji_sidecar"
        # Other DJI-named files fall through to the generic rules below.
    for category, extensions in _CATEGORY_RULES:
        if ext in extensions:
            return category
    return "other"
# ---------------------------------------------
# MAIN SCAN LOGIC
# ---------------------------------------------
# Walk every configured dataset and collect one metadata record per file.
inventory = []
for dataset_name, root in DATASETS.items():
    # Parenthesize the mount point: the old f-string had no separator, so the
    # message printed as e.g. "GlacierBucket1/mnt/bucket1".
    print(f"Scanning dataset: {dataset_name} ({root})")
    if not os.path.exists(root):
        print(f"⚠ WARNING: Path does not exist: {root}")
        continue
    for dirpath, _, files in os.walk(root):
        for f in files:
            full = os.path.join(dirpath, f)
            # Only os.stat() can realistically fail here (broken symlink,
            # permission denied, NFS drop); keep the try body minimal so
            # unrelated bugs are not silently reported as read errors.
            try:
                stat = os.stat(full)
            except OSError as e:
                print(f"Error reading file {full}: {e}")
                continue
            ext = os.path.splitext(f)[1]
            inventory.append(
                {
                    "dataset": dataset_name,
                    "path": full,
                    "filename": f,
                    "extension": ext,
                    "size_bytes": stat.st_size,
                    "modified": stat.st_mtime,  # epoch seconds
                    "created": stat.st_ctime,   # ctime: inode change time on Unix
                    "hash_sha1": sha1(full),    # None unless ENABLE_HASH
                    "category": categorize(ext, f),
                }
            )
# ---------------------------------------------
# SAVE OUTPUT
# ---------------------------------------------
# Write the inventory as UTF-8 JSON. An explicit encoding avoids relying on
# the platform default (which can fail on non-ASCII filenames), and
# ensure_ascii=False keeps those filenames human-readable in the output.
with open(OUTPUT, "w", encoding="utf-8") as out:
    json.dump(inventory, out, indent=2, ensure_ascii=False)
print(f"\nInventory complete. {len(inventory)} files indexed.")
print(f"Output saved to: {OUTPUT}")