Source code for IndicPhotoOCR.script_identification.vit.vit_infer

from transformers import AutoImageProcessor,ViTForImageClassification,pipeline
from PIL import Image
from datasets import DatasetDict,Dataset,ClassLabel
import torchvision.transforms as transforms
import numpy as np
import csv
import os
import argparse
import requests
from tqdm import tqdm
import zipfile
import time
import glob
from IndicPhotoOCR.script_identification.vit.config import infer_config as config

# Registry of downloadable script-identification checkpoints.
#
# Each key is a model name accepted by VIT_identifier; each value holds:
#   "path"          - directory (relative to the ViT package root used by
#                     ensure_model) where the extracted model is stored,
#   "url"           - release archive (.zip) to download when that directory
#                     does not exist yet,
#   "subcategories" - script labels associated with the model.
model_info = {
    "hindi": {
        "path": "models/hindienglish",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglish.zip",
        "subcategories": ["hindi", "english"],
    },
    "assamese": {
        "path": "models/hindienglishassamese",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishassamese.zip",
        "subcategories": ["hindi", "english", "assamese"],
    },
    "bengali": {
        "path": "models/hindienglishbengali",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishbengali.zip",
        "subcategories": ["hindi", "english", "bengali"],
    },
    "gujarati": {
        "path": "models/hindienglishgujarati",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishgujarati.zip",
        "subcategories": ["hindi", "english", "gujarati"],
    },
    "kannada": {
        "path": "models/hindienglishkannada",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishkannada.zip",
        "subcategories": ["hindi", "english", "kannada"],
    },
    "malayalam": {
        "path": "models/hindienglishmalayalam",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishmalayalam.zip",
        "subcategories": ["hindi", "english", "malayalam"],
    },
    "marathi": {
        "path": "models/hindienglishmarathi",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishmarathi.zip",
        "subcategories": ["hindi", "english", "marathi"],
    },
    "meitei": {
        "path": "models/hindienglishmeitei",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishmeitei.zip",
        "subcategories": ["hindi", "english", "meitei"],
    },
    "odia": {
        "path": "models/hindienglishodia",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishodia.zip",
        "subcategories": ["hindi", "english", "odia"],
    },
    "punjabi": {
        "path": "models/hindienglishpunjabi",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishpunjabi.zip",
        "subcategories": ["hindi", "english", "punjabi"],
    },
    "tamil": {
        "path": "models/hindienglishtamil",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishtamil.zip",
        "subcategories": ["hindi", "english", "tamil"],
    },
    "telugu": {
        "path": "models/hindienglishtelugu",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishtelugu.zip",
        "subcategories": ["hindi", "english", "telugu"],
    },
    "auto": {
        "path": "models/12_classes",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/12_classes.zip",
        # NOTE(review): "telegu" is spelled differently from the "telugu" key
        # above; left unchanged because other code may match on this exact
        # string - confirm against the labels the checkpoint emits.
        "subcategories": ["hindi", "english", "assamese", "bengali", "gujarati", "kannada", "malayalam", "marathi", "odia", "punjabi", "tamil", "telegu"],
    },
    "10C": {
        # Must not share a directory with "auto": ensure_model() treats an
        # existing directory as an already-downloaded model, so a shared path
        # would make whichever archive was fetched first serve both names.
        "path": "models/10_classes",
        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/10_classes.zip",
        # NOTE(review): this list has 12 entries although the archive is named
        # 10_classes - presumably copied from "auto"; confirm the real label set.
        "subcategories": ["hindi", "english", "assamese", "bengali", "gujarati", "kannada", "malayalam", "marathi", "odia", "punjabi", "tamil", "telegu"],
    },
}

# Identifier of the base ViT checkpoint, taken from the inference config.
pretrained_vit_model = config['pretrained_vit_model']
# Image processor for that checkpoint. It is created once, when this module is
# imported, and passed to every classification pipeline built below.
processor = AutoImageProcessor.from_pretrained(pretrained_vit_model,use_fast=True)



class VIT_identifier:
    """
    A class for script identification using a ViT (Vision Transformer) model.

    Models are looked up by name in the module-level ``model_info`` registry,
    downloaded on first use, and wrapped in a Hugging Face
    ``image-classification`` pipeline.
    """

    # Lower-case file extensions accepted as image input.
    IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")

    def __init__(self):
        """
        Initializes the VIT_identifier class.
        """
        pass

    def unzip_file(self, zip_path, extract_to):
        """
        Extracts a ZIP file to a specified directory.

        Args:
            zip_path (str): Path to the ZIP file.
            extract_to (str): Directory where files should be extracted.
        """
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
        print(f"Extracted files to {extract_to}")

    def ensure_model(self, model_name):
        """
        Ensures that the specified model is available locally. If not, downloads and extracts it.

        The model directory is only left on disk when the download and the
        extraction both succeed, so a failed attempt is retried on the next
        call instead of being mistaken for an installed model.

        Args:
            model_name (str): The name of the model to check/download.

        Returns:
            str: The local path of the model.

        Raises:
            KeyError: If ``model_name` is not a key of ``model_info``.
            requests.HTTPError: If the server answers with an error status.
            zipfile.BadZipFile: If the downloaded archive is not a valid ZIP.
        """
        try:
            info = model_info[model_name]
        except KeyError:
            raise KeyError(
                f"Unknown model_name {model_name!r}; expected one of {sorted(model_info)}"
            ) from None

        # NOTE: this root is relative to the current working directory.
        root_model_dir = "IndicPhotoOCR/script_identification/vit"
        model_path = os.path.join(root_model_dir, info["path"])

        if os.path.exists(model_path):
            return model_path

        import shutil  # only needed on the download path, for cleanup

        url = info["url"]
        print(f"Model not found locally. Downloading {model_name} from {url}...")
        os.makedirs(model_path, exist_ok=True)
        zip_path = os.path.join(model_path, "temp_download.zip")

        try:
            with requests.get(url, stream=True, timeout=60) as response:
                # Without this, an HTML error page would be saved as the "zip".
                response.raise_for_status()
                with open(zip_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)

            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(model_path)
            os.remove(zip_path)
        except BaseException:
            # Covers Ctrl-C as well: a half-populated directory would otherwise
            # pass the os.path.exists() check above on every later call.
            shutil.rmtree(model_path, ignore_errors=True)
            raise

        print(f"Downloaded and extracted to {model_path}")
        return model_path

    def _is_supported_image(self, path):
        """Return True if ``path`` ends with a supported image extension (case-insensitive)."""
        return os.fspath(path).lower().endswith(self.IMAGE_EXTENSIONS)

    def _build_pipeline(self, model_name, device):
        """Load the named model (downloading it if needed) and wrap it in a classification pipeline."""
        model_path = self.ensure_model(model_name)
        vit = ViTForImageClassification.from_pretrained(model_path)
        return pipeline('image-classification', model=vit, feature_extractor=processor, device=device)

    def identify(self, image_path, model_name, device):
        """
        Identifies the script in a given image using a ViT model.

        Args:
            image_path (str): Path to the input image (.png, .jpg or .jpeg, any letter case).
            model_name (str): Name of the model to be used.
            device (int): Device to run the model on (e.g., 0 for GPU, -1 for CPU).

        Returns:
            str: The predicted script label.

        Raises:
            ValueError: If ``image_path`` does not have a supported image extension.
        """
        # Validate before loading the model so bad input fails fast and clearly.
        if not self._is_supported_image(image_path):
            raise ValueError(
                f"Unsupported image file {image_path!r}; expected one of {self.IMAGE_EXTENSIONS}"
            )

        model = self._build_pipeline(model_name, device)

        with Image.open(image_path) as image:
            output = model(image)
        return max(output, key=lambda x: x['score'])['label']

    def predict_batch(self, image_dir, model_name, time_show, output_csv="prediction.csv", device=0):
        """
        Processes a batch of images in a directory and predicts the script for each image.

        Files are processed in sorted filename order so the CSV is reproducible;
        files without a supported image extension are skipped.

        Args:
            image_dir (str): Directory containing images.
            model_name (str): Name of the model to be used.
            time_show (bool): Whether to print processing time.
            output_csv (str, optional): Path to save the predictions as a CSV file. Defaults to "prediction.csv".
            device (int, optional): Device to run the model on (e.g., 0 for GPU, -1 for CPU). Defaults to 0.

        Returns:
            str: The output CSV file path containing predictions.
        """
        model = self._build_pipeline(model_name, device)

        start_time = time.time()
        results = []

        for filename in sorted(os.listdir(image_dir)):
            if not self._is_supported_image(filename):
                continue
            img_path = os.path.join(image_dir, filename)
            with Image.open(img_path) as image:
                output = model(image)
            predicted_label = max(output, key=lambda x: x['score'])['label'].capitalize()
            results.append({"Filepath": filename, "Language": predicted_label})

        elapsed_time = time.time() - start_time
        if time_show:
            print(f"Time taken to process {len(results)} images: {elapsed_time:.2f} seconds")

        with open(output_csv, mode="w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=["Filepath", "Language"])
            writer.writeheader()
            writer.writerows(results)

        return output_csv




# if __name__ == "__main__":
#     # Argument parser for command line usage
#     parser = argparse.ArgumentParser(description="Script identification using a fine-tuned ViT model")
#     parser.add_argument("--image_path", type=str, help="Path to the input image")
#     parser.add_argument("--image_dir", type=str, help="Path to the input image directory")
#     parser.add_argument("--model_name", type=str, choices=model_info.keys(), help="Name of the model (e.g., hindi, punjabi, gujarati, auto)")
#     parser.add_argument("--batch", action="store_true", help="Process images in batch mode if specified")
#     parser.add_argument("--time",type=bool, nargs="?", const=True, default=False, help="Prints the time required to process a batch of images")

#     args = parser.parse_args()


#     # Choose function based on the batch parameter
#     if args.batch:
#         if not args.image_dir:
#             print("Error: image_dir is required when batch is set to True.")
#         else:
#             result = VIT_identifier().predict_batch(args.image_dir, args.model_name, args.time)
#             print(result)
#     else:
#         if not args.image_path:
#             print("Error: image_path is required when batch is not set.")
#         else:
#             result = VIT_identifier().identify(args.image_path, args.model_name, device=-1)  # -1 = CPU
#             print(result)