0% found this document useful (0 votes)
14 views2 pages

Word Extraction-1

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
14 views2 pages

Word Extraction-1

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 2

import cv2

import numpy as np
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

# Path to the Tesseract OCR executable; adjust for the local install.
# (The original source had this raw string broken across two lines.)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# PIL raises DecompressionBombError on very large images by default;
# pages rendered at 600 DPI exceed that limit, so disable the check.
Image.MAX_IMAGE_PIXELS = None

def detect_and_highlight_shapes(pdf_path, output_image_path, dpi=600):
    """Highlight side-sharing rectangle pairs on each PDF page and OCR them.

    Each page is rendered to an image, rectangle candidates are detected via
    edge detection + contour approximation, and every rectangle that shares a
    full side with at least one other rectangle is outlined in green.  The text
    inside each highlighted rectangle is then extracted with Tesseract and
    printed.

    Args:
        pdf_path: Path to the input PDF file.
        output_image_path: Folder where the highlighted page images are saved.
        dpi: Render resolution for PDF-to-image conversion; higher values
            improve OCR accuracy at the cost of memory.
    """
    # Convert PDF pages to PIL images at the requested resolution.
    images = convert_from_path(pdf_path, dpi=dpi)

    for page_number, img in enumerate(images):
        # PIL produces RGB; OpenCV expects BGR channel order.
        img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

        # Grayscale + Gaussian blur suppress noise before edge detection.
        gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        edges = cv2.Canny(blurred, 50, 150)

        contours, _ = cv2.findContours(edges, cv2.RETR_TREE,
                                       cv2.CHAIN_APPROX_SIMPLE)

        # Keep only contours that approximate to exactly 4 corners.
        rectangles = []
        for contour in contours:
            approx = cv2.approxPolyDP(
                contour, 0.02 * cv2.arcLength(contour, True), True)
            if len(approx) == 4:
                rectangles.append(approx)

        # Collect indices of rectangles sharing a side with any other first,
        # so each rectangle is drawn and OCR'd exactly once.  (The previous
        # version re-drew and re-extracted a rectangle for every matching
        # partner, duplicating Tesseract calls and output text.)
        matched = set()
        for i in range(len(rectangles)):
            for j in range(i + 1, len(rectangles)):
                if share_common_side(rectangles[i], rectangles[j]):
                    matched.add(i)
                    matched.add(j)

        extracted_texts = []
        for idx in sorted(matched):
            rect = rectangles[idx]
            # Outline the matched rectangle in green.
            cv2.drawContours(img_cv, [rect], -1, (0, 255, 0), 3)
            # Crop the rectangle's bounding box and OCR its contents.
            x, y, w, h = cv2.boundingRect(rect)
            roi = img_cv[y:y + h, x:x + w]
            text = pytesseract.image_to_string(roi)
            extracted_texts.append(text.strip())

        # Save the annotated page image.
        output_image_file = (
            f"{output_image_path}/highlighted_page_{page_number + 1}.png")
        cv2.imwrite(output_image_file, img_cv)
        print(f"Processed page {page_number + 1}, "
              f"saved highlighted image as {output_image_file}")

        # Report the OCR results for this page.
        print("Extracted Texts:")
        for text in extracted_texts:
            if text:  # skip empty OCR results
                print(text)

def share_common_side(rect1, rect2):
    """Return True when the two 4-point contours share a full edge.

    An edge counts as shared regardless of direction: the edge (a, b) in
    one rectangle matches either (a, b) or (b, a) in the other.
    """
    def undirected_edges(rect):
        # Normalize each edge to a frozenset of its endpoints so that
        # direction does not matter when comparing edges.
        pts = [tuple(vertex[0]) for vertex in rect]
        return {frozenset((pts[k], pts[(k + 1) % 4])) for k in range(4)}

    return not undirected_edges(rect1).isdisjoint(undirected_edges(rect2))

# Usage: guard the script entry point so importing this module does not
# immediately start PDF processing.  (The original also had its trailing
# comment wrapped onto a new line, which was a syntax error.)
if __name__ == "__main__":
    pdf_file_path = r'C:\Users\pcdiv\OneDrive\Desktop\Condensate System P&ID.pdf'
    # Folder where the highlighted page images will be saved.
    output_image_folder = r'C:\Users\pcdiv\OneDrive\Desktop'
    detect_and_highlight_shapes(pdf_file_path, output_image_folder)

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy