import os

import cv2
import pytesseract
from PIL import Image
from pytesseract import Output

# Path to the Tesseract executable (make sure Tesseract is installed).

# Tell pytesseract which Tesseract binary to invoke. This is a Windows-style
# path; change it if Tesseract lives elsewhere on this machine.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Image-processing function.

def process_image(image_path, output_format="txt"):
    """Run OCR on an image and save the result as a text file or a PDF.

    The image is converted to grayscale and smoothed with a 5x5 Gaussian blur
    before recognition. The recognized text is always printed to stdout; the
    file that gets written depends on ``output_format``.

    Args:
        image_path: Path of the image file to read.
        output_format: ``"txt"`` writes the text to ``output_text.txt``
            (UTF-8); ``"pdf"`` writes ``output_document.pdf``. Any other
            value is reported as an error and no file is written.

    Returns:
        None. Failures are not raised to the caller; they are printed with
        an ``Error:`` prefix.
    """
    # Shared by both OCR calls so the PDF is recognized with the same
    # languages and page-segmentation mode as the plain-text output.
    ocr_lang = "eng+hin+ben"
    ocr_config = "--psm 6"

    try:
        # Load the image. cv2.imread does not raise for a missing or
        # unreadable file; it returns None, so check explicitly instead of
        # letting cvtColor fail with an unrelated-looking error.
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(
                f"could not read image file: {image_path!r}"
            )
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Noise removal.
        processed_image = cv2.GaussianBlur(gray, (5, 5), 0)

        # Recognize text with OCR.
        text_data = pytesseract.image_to_string(
            processed_image, lang=ocr_lang, config=ocr_config
        )

        # Show the recognized text regardless of the chosen output format.
        print("\nExtracted Text:\n", text_data)

        # Save a file according to the requested output format.
        if output_format == "txt":
            with open("output_text.txt", "w", encoding="utf-8") as file:
                file.write(text_data)
            print("Text saved as 'output_text.txt'.")

        elif output_format == "pdf":
            pdf_output = pytesseract.image_to_pdf_or_hocr(
                processed_image,
                lang=ocr_lang,
                config=ocr_config,
                extension='pdf',
            )
            with open("output_document.pdf", "wb") as file:
                file.write(pdf_output)
            print("PDF saved as 'output_document.pdf'.")

        else:
            # Previously an unknown format silently produced no file.
            raise ValueError(
                f"unsupported output format {output_format!r}; "
                "expected 'txt' or 'pdf'"
            )

    except Exception as e:
        # Script-level boundary: report the problem instead of crashing.
        print("Error:", e)

# Path of the image to process; replace it with the path of your own image.
image_path = "sample_image.jpg"

# Output format: choose either 'txt' or 'pdf'.
output_format = "pdf"

# Call the function.
process_image(image_path, output_format)