-
Notifications
You must be signed in to change notification settings - Fork 0
/
layout_processing.py
109 lines (93 loc) · 4.2 KB
/
layout_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import re
from typing import List, Dict, Optional
from dotenv import load_dotenv
from unstract.llmwhisperer.client import LLMWhispererClient, LLMWhispererClientException
from vote_validation import VoteValidator
# Ballot-parsing constants

# Letter marks accepted as a vote.
VALID_VOTE_CHARACTERS = ['X', 'x']
# Digit marks accepted as a vote.
VALID_VOTE_NUMBERS = ['1', '2', '3']
# Every single character that counts as a vote mark (letters first, then digits).
VALID_VOTES = [*VALID_VOTE_CHARACTERS, *VALID_VOTE_NUMBERS]

# Candidate names as they are reported in the extracted records.
CANDIDATE_NAMES = [
    'PAMUDU RANASINGHE',
    'KASUN JAYAWARDENA',
    'THARINDU FERNANDO',
    'SHENAL RATHNAYAKE',
    'MAHESHA HETTIARACHCHI',
    'RAVINDU WICKRAMASINGHE',
    'MANUJA WIJESINGHE',
    'ISURU KARUNARATNE',
]

# Column from which a ballot line is scanned for a vote mark; assumed to be
# the position where the candidate name ends in the ballot line.
NAME_END_POSITION = 20
class LayoutProcessor:
    """Turn a ballot document into layout-preserving text via LLMWhisperer."""

    def __init__(self):
        """Load environment settings and build the LLMWhisperer client.

        The API key and base URL are read from the ``LLMWHISPER_API_KEY`` and
        ``LLMWHISPER_BASE_URL`` environment variables after ``load_dotenv()``
        has run.
        """
        load_dotenv()
        self.client = LLMWhispererClient(
            base_url=os.getenv('LLMWHISPER_BASE_URL'),
            api_key=os.getenv('LLMWHISPER_API_KEY'),
            logging_level="INFO",
        )

    def img_to_structured_text(self, pdf_path: str) -> str:
        """Run the document at ``pdf_path`` through LLMWhisperer and return its text.

        Args:
            pdf_path: Path of the file passed to the client as ``file_path``.

        Returns:
            The ``"extracted_text"`` entry of the client's result. If the
            client raises ``LLMWhispererClientException``, a message string
            starting with ``'PDF conversion failed with error: '`` is returned
            instead of raising.
        """
        # NOTE(review): on failure the error message is returned in place of
        # the extracted text, so callers cannot tell the two apart by type --
        # confirm this is intended.
        whisper_options = {
            'processing_mode': 'ocr',
            'output_mode': 'line-printer',
            'force_text_processing': False,
            'line_splitter_tolerance': 0.4,
            'horizontal_stretch_factor': 1.2,
        }
        try:
            response = self.client.whisper(file_path=pdf_path, **whisper_options)
            return response["extracted_text"]
        except LLMWhispererClientException as exc:
            return f'PDF conversion failed with error: {exc}'
class VoteExtractor:
    """Helpers that pull a vote mark or a candidate name out of a text line."""

    @staticmethod
    def extract_vote(line: str) -> Optional[str]:
        """Return the first valid vote character found in ``line``.

        Every literal ``'[X]'`` sequence is removed before scanning, so it is
        never counted as a vote (presumably a printed placeholder -- confirm).

        Args:
            line: Text to scan.

        Returns:
            The first character that is a member of ``VALID_VOTES``, or
            ``None`` when the line contains none.
        """
        cleaned = line.replace('[X]', '')
        return next((mark for mark in cleaned if mark in VALID_VOTES), None)

    @staticmethod
    def extract_candidate_name(text: str) -> Optional[str]:
        """Return the first known candidate whose name appears in ``text``.

        Matching ignores all whitespace and letter case: both the candidate
        name and the input are stripped of whitespace and upper-cased before
        the substring test.

        Args:
            text: Text to search.

        Returns:
            The matching entry of ``CANDIDATE_NAMES`` (in list order), or
            ``None`` when no candidate name is contained in ``text``.
        """
        squashed_input = ''.join(text.split()).upper()
        for name in CANDIDATE_NAMES:
            squashed_name = ''.join(name.split()).upper()
            if squashed_name in squashed_input:
                return name
        return None
class VotingSystem:
    """Orchestrates the voting process, from document conversion to vote extraction."""

    def __init__(self, pdf_path: str):
        """Create the layout processor and remember which ballot file to read.

        Args:
            pdf_path: Path of the ballot file handed to the layout processor.
        """
        self.pdf_processor = LayoutProcessor()
        self.pdf_path = pdf_path

    @staticmethod
    def _find_vote(lines: List[str], index: int) -> Optional[str]:
        """Look for a vote on line ``index``, then the previous line, then the next.

        Neighbouring lines are only consulted when they exist: the first line
        has no previous line and the last line has no next line.
        """
        vote = VoteExtractor.extract_vote(lines[index][NAME_END_POSITION:])
        # Guard against index -1, which would silently wrap to the last line.
        if vote is None and index > 0:
            vote = VoteExtractor.extract_vote(lines[index - 1][NAME_END_POSITION:])
        # Guard against running past the end of the text (IndexError).
        if vote is None and index + 1 < len(lines):
            vote = VoteExtractor.extract_vote(lines[index + 1][NAME_END_POSITION:])
        # NOTE(review): a neighbouring line may itself belong to another
        # candidate, in which case that candidate's mark is picked up here --
        # confirm this fallback is acceptable for the ballot layout.
        return vote

    def process_votes(self) -> List[Dict[str, Optional[str]]]:
        """Process the votes from the ballot file.

        The extracted text is split on newlines and each line is stripped.
        For every line containing a known candidate name, a vote is searched
        from ``NAME_END_POSITION`` onwards on that line, then on the previous
        line, then on the next line.

        Returns:
            One record per candidate line, in order of appearance, with keys
            ``'sheet_position'`` (1-based running count of candidate lines,
            an int), ``'candidate_name'`` and ``'vote'`` (``None`` when no
            vote mark was found).
        """
        extracted_text = self.pdf_processor.img_to_structured_text(self.pdf_path)
        lines = [line.strip() for line in extracted_text.split('\n')]

        records = []
        for i, line in enumerate(lines):
            candidate_name = VoteExtractor.extract_candidate_name(line)
            if not candidate_name:
                continue
            records.append({
                'sheet_position': len(records) + 1,
                'candidate_name': candidate_name,
                'vote': self._find_vote(lines, i),
            })
        return records
def main():
    """Run the ballot pipeline on the hard-coded sample ballot and print the outcome.

    Extracts the votes from ``sample_ballot_papers/vote_1.png``, prints each
    extracted record, then prints whether ``VoteValidator`` accepts the ballot.
    """
    system = VotingSystem('sample_ballot_papers/vote_1.png')
    extracted = system.process_votes()
    checker = VoteValidator(VALID_VOTE_CHARACTERS, VALID_VOTE_NUMBERS)

    print("Extracted votes:")
    for record in extracted:
        print(record)

    verdict = checker.is_valid(extracted)
    print(f"\nBallot is valid: {verdict}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()