0% found this document useful (0 votes)
14 views2 pages

Word Extraction-1

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
14 views2 pages

Word Extraction-1

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 2

import cv2

import numpy as np
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

# Path to the Tesseract OCR executable; adjust for the local install.
# (The original source had this raw string broken across two lines.)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# PIL raises DecompressionBombError on very large images by default;
# pages rendered at 600 DPI exceed that limit, so disable the check.
Image.MAX_IMAGE_PIXELS = None

def detect_and_highlight_shapes(pdf_path, output_image_path, dpi=600):
    """Highlight side-sharing rectangle pairs on each PDF page and OCR them.

    Each page is rendered to an image, rectangle candidates are detected via
    edge detection + contour approximation, and every rectangle that shares a
    full side with at least one other rectangle is outlined in green.  The text
    inside each highlighted rectangle is then extracted with Tesseract and
    printed.

    Args:
        pdf_path: Path to the input PDF file.
        output_image_path: Folder where the highlighted page images are saved.
        dpi: Render resolution for PDF-to-image conversion; higher values
            improve OCR accuracy at the cost of memory.
    """
    # Convert PDF pages to PIL images at the requested resolution.
    images = convert_from_path(pdf_path, dpi=dpi)

    for page_number, img in enumerate(images):
        # PIL produces RGB; OpenCV expects BGR channel order.
        img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

        # Grayscale + Gaussian blur suppress noise before edge detection.
        gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        edges = cv2.Canny(blurred, 50, 150)

        contours, _ = cv2.findContours(edges, cv2.RETR_TREE,
                                       cv2.CHAIN_APPROX_SIMPLE)

        # Keep only contours that approximate to exactly 4 corners.
        rectangles = []
        for contour in contours:
            approx = cv2.approxPolyDP(
                contour, 0.02 * cv2.arcLength(contour, True), True)
            if len(approx) == 4:
                rectangles.append(approx)

        # Collect indices of rectangles sharing a side with any other first,
        # so each rectangle is drawn and OCR'd exactly once.  (The previous
        # version re-drew and re-extracted a rectangle for every matching
        # partner, duplicating Tesseract calls and output text.)
        matched = set()
        for i in range(len(rectangles)):
            for j in range(i + 1, len(rectangles)):
                if share_common_side(rectangles[i], rectangles[j]):
                    matched.add(i)
                    matched.add(j)

        extracted_texts = []
        for idx in sorted(matched):
            rect = rectangles[idx]
            # Outline the matched rectangle in green.
            cv2.drawContours(img_cv, [rect], -1, (0, 255, 0), 3)
            # Crop the rectangle's bounding box and OCR its contents.
            x, y, w, h = cv2.boundingRect(rect)
            roi = img_cv[y:y + h, x:x + w]
            text = pytesseract.image_to_string(roi)
            extracted_texts.append(text.strip())

        # Save the annotated page image.
        output_image_file = (
            f"{output_image_path}/highlighted_page_{page_number + 1}.png")
        cv2.imwrite(output_image_file, img_cv)
        print(f"Processed page {page_number + 1}, "
              f"saved highlighted image as {output_image_file}")

        # Report the OCR results for this page.
        print("Extracted Texts:")
        for text in extracted_texts:
            if text:  # skip empty OCR results
                print(text)

def share_common_side(rect1, rect2):
    """Return True when the two 4-point contours share a full edge.

    An edge counts as shared regardless of direction: the edge (a, b) in
    one rectangle matches either (a, b) or (b, a) in the other.
    """
    def undirected_edges(rect):
        # Normalize each edge to a frozenset of its endpoints so that
        # direction does not matter when comparing edges.
        pts = [tuple(vertex[0]) for vertex in rect]
        return {frozenset((pts[k], pts[(k + 1) % 4])) for k in range(4)}

    return not undirected_edges(rect1).isdisjoint(undirected_edges(rect2))

# Usage: guard the script entry point so importing this module does not
# immediately start PDF processing.  (The original also had its trailing
# comment wrapped onto a new line, which was a syntax error.)
if __name__ == "__main__":
    pdf_file_path = r'C:\Users\pcdiv\OneDrive\Desktop\Condensate System P&ID.pdf'
    # Folder where the highlighted page images will be saved.
    output_image_folder = r'C:\Users\pcdiv\OneDrive\Desktop'
    detect_and_highlight_shapes(pdf_file_path, output_image_folder)

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy