CryoLens/embed-007.py

#!/usr/bin/env python3
import os
from pathlib import Path
from tqdm import tqdm
import requests
import json
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import hashlib
from typing import List, Dict, Optional
import time

class AnythingLLMDocumentProcessor:
    """Upload local files to an AnythingLLM workspace and embed them there.

    Each file is sent to the ``/api/v1/document/upload`` endpoint and the
    returned document location is then added to the configured workspace via
    ``/api/v1/workspace/<slug>/update-embeddings``. Successfully processed
    files can optionally be moved into a sub-folder so they are skipped on
    the next run.

    A ``QdrantClient`` is created and stored on the instance, but none of the
    methods in this class use it.
    """

    def __init__(
        self,
        anythingllm_url: str = "http://10.100.50.16:30239",
        # SECURITY(review): an API key is committed here as a default value.
        # It is kept only so existing callers keep working; rotate it and
        # pass the key explicitly (e.g. loaded from the environment).
        anythingllm_api_key: str = "TN4TAY5-BB6M8AP-KF5PWWF-E3TFJ1E",
        workspace_slug: str = "dialogue",
        qdrant_url: str = "http://10.100.50.16:6333",
        collection_name: str = "dialogue",
        move_after_embed: bool = True,
        embedded_folder_name: str = "embedded",
        request_timeout: Optional[float] = 300.0,
    ):
        """
        Initialize the document processor for AnythingLLM.

        Args:
            anythingllm_url: URL of your local AnythingLLM instance
            anythingllm_api_key: API key for AnythingLLM
            workspace_slug: The slug of your workspace (e.g., 'dialogue')
            qdrant_url: URL of your local Qdrant instance
            collection_name: Name of the Qdrant collection
            move_after_embed: If True, move files to embedded folder after successful embedding
            embedded_folder_name: Name of the folder to move embedded files to
            request_timeout: Seconds to wait for each HTTP request before
                giving up; None waits indefinitely
        """
        self.anythingllm_url = anythingllm_url.rstrip('/')
        self.anythingllm_api_key = anythingllm_api_key
        self.workspace_slug = workspace_slug
        self.qdrant_client = QdrantClient(url=qdrant_url)
        self.collection_name = collection_name
        self.move_after_embed = move_after_embed
        self.embedded_folder_name = embedded_folder_name
        self.request_timeout = request_timeout

        self.headers = {
            "Accept": "application/json",
        }

        # Only send an Authorization header when a key was actually given.
        if self.anythingllm_api_key:
            self.headers["Authorization"] = f"Bearer {self.anythingllm_api_key}"

        print("✓ Initialized AnythingLLM Document Processor")
        print(f"  Workspace: {self.workspace_slug}")
        print(f"  Move after embed: {self.move_after_embed}")

    @staticmethod
    def _extract_workspace(payload: Dict) -> Dict:
        """Return the workspace object from a workspace API response.

        Accepts the ``workspace`` value as a dict, as a list (the first
        element is used) or as null/missing, so callers can always use
        ``.get`` on the result.
        NOTE(review): the list form is believed to be what some AnythingLLM
        versions return -- confirm against the deployed API.
        """
        workspace = payload.get('workspace') if isinstance(payload, dict) else None
        if isinstance(workspace, list):
            workspace = workspace[0] if workspace else None
        return workspace if isinstance(workspace, dict) else {}

    def _verify_workspace(self) -> None:
        """Print whether the configured workspace is reachable.

        This is a best-effort check: every failure is reported as a warning
        and never raised, so processing continues regardless.
        """
        try:
            url = f"{self.anythingllm_url}/api/v1/workspace/{self.workspace_slug}"
            response = requests.get(url, headers=self.headers, timeout=self.request_timeout)

            if response.status_code == 200:
                workspace = self._extract_workspace(response.json())
                print(f"✓ Connected to workspace: {workspace.get('name', self.workspace_slug)}")
            else:
                print(f"⚠ Warning: Could not verify workspace '{self.workspace_slug}'")
                print(f"  Status code: {response.status_code}")
        except Exception as e:
            print(f"⚠ Warning: Could not verify workspace: {e}")

    def verify_embeddings(self) -> List[Dict]:
        """Print and return the documents currently listed in the workspace.

        Returns:
            The workspace's ``documents`` list, or an empty list when there
            are none or the request fails (the error is printed, not raised).
        """
        print("\n" + "="*60)
        print("Verifying embedded documents...")
        print("="*60)

        try:
            url = f"{self.anythingllm_url}/api/v1/workspace/{self.workspace_slug}"
            response = requests.get(url, headers=self.headers, timeout=self.request_timeout)
            response.raise_for_status()

            documents = self._extract_workspace(response.json()).get('documents') or []

            if not documents:
                print("No documents found in workspace")
                return []

            print(f"\nTotal documents in workspace: {len(documents)}")
            print("\nDocument Status:")
            print("-"*60)

            for doc in documents:
                # A missing or null docpath is shown as 'Unknown' instead of
                # aborting the whole listing.
                doc_name = doc.get('docpath') or 'Unknown'
                filename = Path(doc_name).name if doc_name != 'Unknown' else 'Unknown'
                print(f"📄 {filename}")

            print("-"*60)
            print(f"Total: {len(documents)} document(s)")
            print("="*60 + "\n")

            return documents

        except Exception as e:
            print(f"Error verifying embeddings: {e}")
            return []

    def list_workspaces(self) -> List[Dict]:
        """Print and return all workspaces reported by the server.

        Returns:
            The ``workspaces`` list from the API response, or an empty list
            when there are none or the request fails (the error is printed).
        """
        try:
            url = f"{self.anythingllm_url}/api/v1/workspaces"
            response = requests.get(url, headers=self.headers, timeout=self.request_timeout)
            response.raise_for_status()

            workspaces = response.json().get('workspaces') or []

            if workspaces:
                print("\nAvailable workspaces:")
                print("=" * 60)
                for ws in workspaces:
                    print(f"  Name: {ws.get('name')}")
                    print(f"  Slug: {ws.get('slug')}")
                    print("-" * 60)
            else:
                print("No workspaces found")

            return workspaces
        except Exception as e:
            print(f"Error listing workspaces: {e}")
            return []

    def _upload_file_to_anythingllm(self, file_path: Path) -> Dict:
        """Upload a file to AnythingLLM for processing.

        Args:
            file_path: File to send as the multipart ``file`` field.

        Returns:
            The decoded JSON body of the upload response.

        Raises:
            RuntimeError: If the file cannot be read, the request fails or
                the server answers with an error status. The original
                exception is chained as the cause.
        """
        url = f"{self.anythingllm_url}/api/v1/document/upload"

        try:
            with open(file_path, 'rb') as f:
                files = {
                    'file': (file_path.name, f, self._get_mime_type(file_path))
                }

                # self.headers carries no Content-Type, so requests is free
                # to generate the multipart boundary itself.
                response = requests.post(
                    url,
                    headers=self.headers,
                    files=files,
                    timeout=self.request_timeout,
                )
                response.raise_for_status()
                return response.json()
        except Exception as e:
            raise RuntimeError(f"Failed to upload file: {e}") from e

    def _embed_document_in_workspace(self, document_location: str) -> Dict:
        """Add an uploaded document to the workspace for embedding.

        Args:
            document_location: The ``location`` value returned by the upload.

        Returns:
            The decoded JSON body of the update-embeddings response.

        Raises:
            RuntimeError: If the request fails or returns an error status.
                The original exception is chained as the cause.
        """
        url = f"{self.anythingllm_url}/api/v1/workspace/{self.workspace_slug}/update-embeddings"

        payload = {
            "adds": [document_location]
        }

        try:
            response = requests.post(
                url,
                headers=self.headers,
                json=payload,
                timeout=self.request_timeout,
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            raise RuntimeError(f"Failed to embed document in workspace: {e}") from e

    def _get_mime_type(self, file_path: Path) -> str:
        """Get MIME type based on file extension (case-insensitive).

        Unknown extensions map to ``application/octet-stream``.
        """
        mime_types = {
            '.txt': 'text/plain',
            '.pdf': 'application/pdf',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.doc': 'application/msword',
            '.md': 'text/markdown',
            '.csv': 'text/csv',
            '.json': 'application/json',
            '.html': 'text/html',
        }
        return mime_types.get(file_path.suffix.lower(), 'application/octet-stream')

    def _collect_files(self, folder: Path, file_extensions: Optional[List[str]] = None) -> List[Path]:
        """Return the files under *folder* that should be processed, sorted.

        Files inside a directory named ``embedded_folder_name`` are skipped.
        The check uses the path relative to *folder*, so an ancestor of
        *folder* that happens to share that name does not exclude everything.

        Args:
            folder: Root directory to search recursively.
            file_extensions: Suffixes to keep, matched case-insensitively and
                with or without the leading dot. None or empty keeps all files.
        """
        wanted = None
        if file_extensions:
            wanted = {
                (ext if (not ext or ext.startswith('.')) else f'.{ext}').lower()
                for ext in file_extensions
            }

        selected = []
        for candidate in folder.rglob('*'):
            if not candidate.is_file():
                continue
            # Only directory components below `folder` are inspected.
            relative_dirs = candidate.relative_to(folder).parts[:-1]
            if self.embedded_folder_name in relative_dirs:
                continue
            if wanted is not None and candidate.suffix.lower() not in wanted:
                continue
            selected.append(candidate)

        # Sorting makes the processing order reproducible between runs.
        return sorted(selected)

    def _move_to_embedded(self, file_path: Path, folder: Path, embedded_folder: Path) -> Path:
        """Move *file_path* into *embedded_folder*, keeping its sub-path.

        Returns:
            The file's path relative to *folder* (and to *embedded_folder*
            after the move).
        """
        relative_path = file_path.relative_to(folder)
        destination = embedded_folder / relative_path

        # Create subdirectories if needed
        destination.parent.mkdir(parents=True, exist_ok=True)
        file_path.rename(destination)
        return relative_path

    def process_folder(self, folder_path: str, file_extensions: Optional[List[str]] = None) -> None:
        """
        Process all files in a folder and upload to AnythingLLM workspace.

        Args:
            folder_path: Path to folder containing files
            file_extensions: List of file extensions to process (e.g., ['.txt', '.md'])
                           If None, process all files

        Raises:
            ValueError: If folder_path does not exist or is not a directory.
        """
        folder = Path(folder_path)

        if not folder.exists():
            raise ValueError(f"Folder not found: {folder_path}")
        if not folder.is_dir():
            raise ValueError(f"Not a directory: {folder_path}")

        # Verify workspace before processing
        self._verify_workspace()

        # Create embedded folder if it doesn't exist
        embedded_folder = folder / self.embedded_folder_name
        if self.move_after_embed:
            embedded_folder.mkdir(exist_ok=True)
            print(f"✓ Embedded files will be moved to: {embedded_folder}")

        # The list is materialised up front, so moving files during the loop
        # below cannot disturb the directory walk.
        files = self._collect_files(folder, file_extensions)

        if not files:
            print("No files found to process")
            return

        print(f"\nFound {len(files)} file(s) to process")
        print(f"Target workspace: {self.workspace_slug}\n")

        successful = 0
        failed = 0
        moved = 0

        for file_path in tqdm(files, desc="Processing files", unit="file"):
            try:
                self._process_single_file(file_path)
            except Exception as e:
                # One bad file must not abort the whole batch.
                failed += 1
                tqdm.write(f"✗ Failed to process {file_path.name}: {str(e)}")
                continue

            successful += 1
            if not self.move_after_embed:
                continue

            # A failed move is only a warning: the file is already embedded.
            try:
                relative_path = self._move_to_embedded(file_path, folder, embedded_folder)
            except (OSError, ValueError) as e:
                tqdm.write(f"⚠ Could not move {file_path.name}: {e}")
            else:
                moved += 1
                tqdm.write(f"✓ Moved: {file_path.name} → {self.embedded_folder_name}/{relative_path}")

        print(f"\n{'='*60}")
        print("Processing complete!")
        print(f"✓ Successful: {successful}/{len(files)}")
        print(f"✗ Failed: {failed}/{len(files)}")
        if self.move_after_embed:
            print(f"📁 Moved to {self.embedded_folder_name}/: {moved}/{successful}")
        print(f"{'='*60}\n")

        # Wait a moment for embeddings to process, then verify
        if successful > 0:
            print("Waiting 5 seconds for embeddings to process...")
            time.sleep(5)
            self.verify_embeddings()

        # Show how to access in UI
        print("\nYou can access these documents in AnythingLLM UI:")
        print(f"  → {self.anythingllm_url}/workspace/{self.workspace_slug}")

    def _process_single_file(self, file_path: Path) -> None:
        """Process a single file: upload and embed in workspace.

        Raises:
            RuntimeError: If the upload or embedding request fails, or the
                upload response contains no document location.
        """
        steps = ['Uploading', 'Embedding']

        with tqdm(total=len(steps), desc=f"  {file_path.name}",
                  leave=False, unit="step", position=1) as pbar:

            # Step 1: Upload file to AnythingLLM
            pbar.set_description(f"  {file_path.name} - Uploading")
            upload_result = self._upload_file_to_anythingllm(file_path)

            # Extract document location from upload result
            documents = upload_result.get('documents') or []
            if not documents:
                raise RuntimeError("No document location returned from upload")

            document_location = documents[0].get('location')
            if not document_location:
                raise RuntimeError("Document location not found in upload response")

            pbar.update(1)

            # Step 2: Embed document in workspace
            pbar.set_description(f"  {file_path.name} - Embedding")
            self._embed_document_in_workspace(document_location)
            pbar.update(1)

            # Small delay to avoid rate limiting
            time.sleep(0.1)


def main():
    """Embed everything under ``./documents`` using the class defaults."""
    # Only files carrying one of these suffixes are picked up.
    extensions = ['.txt', '.md', '.pdf', '.docx', '.csv', '.json']
    source_dir = "./documents"  # point this at your own folder

    # Connection settings come from the class defaults. Pass
    # move_after_embed=False to leave files in their original location.
    doc_processor = AnythingLLMDocumentProcessor(move_after_embed=True)

    # For diagnostics, list_workspaces() and verify_embeddings() can be
    # called on the processor before starting the run.
    doc_processor.process_folder(source_dir, file_extensions=extensions)


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()