Spaces:
Build error
Build error
File size: 5,787 Bytes
9fca407 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import requests
import re
from fpdf import FPDF
import os
import textract
# --- Configuration ---
# Base URL of the text-correction service; endpoint names are appended to it
# as "<AI_SERVICE_URL>/<endpoint>".
AI_SERVICE_URL = "http://localhost:8000"
# Path of the input document that text is extracted from.
INPUT_DOC_PATH = "Doreen.doc"
# Path the generated PDF report is written to.
OUTPUT_PDF_PATH = "Doreen DeFio_Dr. Daniel Rich_Report_Generated.pdf"
def correct_text_via_api(endpoint: str, text: str, timeout: float = 60.0) -> str:
    """Send *text* to the AI correction service and return the corrected text.

    The request is a JSON POST to ``{AI_SERVICE_URL}/{endpoint}`` with the body
    ``{"text": text}``; the reply is expected to be a JSON object carrying a
    ``"corrected_text"`` entry.

    The call is deliberately best-effort: on a transport failure, an HTTP
    error status, a timeout, or a malformed reply, the problem is printed and
    the original *text* is returned unchanged so the caller can carry on.

    Args:
        endpoint: Endpoint name appended to the service base URL.
        text: Text to correct. Empty or whitespace-only text is returned
            as-is without contacting the service.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The corrected text, or *text* itself when the service call fails.
    """
    if not text or not text.strip():
        # Nothing to correct; skip the network round trip.
        return text

    try:
        response = requests.post(
            f"{AI_SERVICE_URL}/{endpoint}",
            json={"text": text},
            # Without a timeout a stalled service would hang the pipeline.
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()["corrected_text"]
    except requests.exceptions.RequestException as e:
        print(f"Error calling AI service at endpoint '{endpoint}': {e}")
    except (ValueError, KeyError, TypeError) as e:
        # The reply was not JSON, or was JSON without the expected field.
        print(f"Unexpected response from AI service at endpoint '{endpoint}': {e!r}")
    return text
def extract_text_from_doc(filepath):
    """Extract the text of the document at *filepath* using textract.

    Args:
        filepath: Path of the document to read.

    Returns:
        The extracted text decoded as UTF-8, or ``None`` when extraction or
        decoding fails (the error is printed).

    Raises:
        FileNotFoundError: If nothing exists at *filepath*.
    """
    # Fail loudly on a missing file; every later failure is best-effort.
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Input file not found at: {filepath}")

    try:
        decoded_text = textract.process(filepath).decode('utf-8')
    except Exception as e:
        print(f"Error reading document with textract: {e}")
        return None
    return decoded_text
# Field labels recognised on "Label: value" lines, in canonical spelling.
_REPORT_FIELDS = (
    "Client Name",
    "Date of Exam",
    "Date of Accident",
    "Examinee",
    "Observed By",
    "Performed By",
    "Specialty",
    "Facility",
    "Facility Description",
    "Appointment Scheduled For",
    "Arrived at Office",
    "Admitted to Exam Room",
    "Intake Start",
    "Exam Start",
    "Exam End",
    "Length of Exam",
    "Total Length of Visit",
    "Others Present",
    "Description of IME physician",
    "Layout of Exam Room",
    "Did IME Physician Have Examinees Medical Records",
)

# Lower-cased label -> canonical label, used to normalise matched keys.
_CANONICAL_FIELD = {name.lower(): name for name in _REPORT_FIELDS}

# Lines are matched one at a time, so DOTALL is not needed.
_KEY_VALUE_PATTERN = re.compile(
    r'^\s*(' + "|".join(re.escape(name) for name in _REPORT_FIELDS) + r')\s*:\s*(.*)',
    re.IGNORECASE,
)

# Lines that open a free-text section (matched exactly, after stripping).
_SECTION_HEADERS = ("Intake:", "Exam:")


def _correct_section(paragraph_lines):
    """Join a section's lines into one paragraph and run both correction passes."""
    paragraph = " ".join(paragraph_lines)
    grammar_corrected = correct_text_via_api("correct_grammar", paragraph)
    return correct_text_via_api("correct_gender", grammar_corrected)


def parse_and_correct_text(raw_text):
    """Parse the raw report text into corrected fields and sections.

    Blank lines are dropped and the remaining lines are stripped. Each line is
    then one of:

    * a section header (``Intake:`` / ``Exam:``), which starts a new free-text
      section;
    * a ``Label: value`` line for a known field, whose value is grammar
      corrected and stored under the field's canonical label. If the value is
      empty, the next line is used as the value unless that line is itself a
      field line or a section header;
    * any other line, which is appended to the current section, or ignored
      when no section has started yet.

    Section text is joined with spaces and passed through the grammar and then
    the gender correction endpoint.

    Args:
        raw_text: Full text of the document. Empty or ``None`` yields ``{}``.

    Returns:
        A dict mapping canonical field labels and section names (``"Intake"``,
        ``"Exam"``) to their corrected text.
    """
    structured_data = {}
    if not raw_text:
        return structured_data

    current_section = None
    buffer = []
    lines = [line.strip() for line in raw_text.split('\n') if line.strip()]

    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1  # from here on, lines[i] is the line after the current one

        if line in _SECTION_HEADERS:
            # Close the section collected so far before opening the next one.
            if current_section and buffer:
                structured_data[current_section] = _correct_section(buffer)
            current_section = line.replace(":", "").strip()
            buffer = []
            continue

        match = _KEY_VALUE_PATTERN.match(line)
        if match:
            key, value = (part.strip() for part in match.groups())
            # Labels match case-insensitively, but the PDF writer looks fields
            # up by exact spelling, so store them in canonical form.
            key = _CANONICAL_FIELD.get(key.lower(), key)
            if (
                not value
                and i < len(lines)
                and not _KEY_VALUE_PATTERN.match(lines[i])
                and lines[i] not in _SECTION_HEADERS
            ):
                # The value sits on the following line; consume it.
                value = lines[i]
                i += 1
            structured_data[key] = correct_text_via_api("correct_grammar", value)
        elif current_section:
            buffer.append(line)

    # Flush the section that was still open at end of input.
    if current_section and buffer:
        structured_data[current_section] = _correct_section(buffer)
    return structured_data
class PDF(FPDF):
    """FPDF document with the report's page header and page-number footer.

    ``report_font`` names the font family used by the header and footer. The
    default, "DejaVu", must be registered with ``add_font`` before the first
    page is added; code that cannot load it may set the attribute to a
    built-in family instead.
    """

    report_font = "DejaVu"

    def header(self):
        """Draw the centred report title at the top of each page."""
        self.set_font(self.report_font, "B", 15)
        self.cell(0, 10, 'IME WatchDog Report', 0, 1, 'C')
        self.ln(10)

    def footer(self):
        """Draw the centred page number 15 units above the bottom edge."""
        self.set_y(-15)
        self.set_font(self.report_font, "I", 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')


def generate_pdf(data, output_path):
    """Write the parsed report fields to a PDF file.

    Fields are written in a fixed order; each present field gets a bold
    ``"<key>:"`` line followed by its value as wrapped text. Keys of *data*
    that are not in the fixed order are not written.

    The DejaVu TrueType fonts are used when their ``.ttf`` files can be
    loaded. Otherwise a warning is printed and the report falls back to the
    built-in Helvetica font, with characters that font cannot encode replaced
    by ``?``.

    Args:
        data: Mapping of field/section name to its text.
        output_path: Path of the PDF file to write.
    """
    pdf = PDF()
    unicode_font = True
    # Register a Unicode font so non-Latin-1 characters can be rendered.
    # The .ttf files are looked up by bare file name; give full paths here if
    # they live somewhere else.
    try:
        pdf.add_font("DejaVu", "", "DejaVuSans.ttf", uni=True)
        pdf.add_font("DejaVu", "B", "DejaVuSans-Bold.ttf", uni=True)
        pdf.add_font("DejaVu", "I", "DejaVuSans-Oblique.ttf", uni=True)
    except (RuntimeError, OSError):
        # A missing font file surfaces as RuntimeError or as an OSError
        # subclass depending on the installed fpdf flavour.
        unicode_font = False
        # Header, footer and body must all stop asking for "DejaVu", or
        # add_page() would fail on the unregistered family.
        pdf.report_font = "Helvetica"
        print("---")
        print("⚠️ FONT WARNING: DejaVuSans.ttf not found.")
        print("The PDF will be generated, but may have character issues.")
        print("Please download the DejaVu font family and place the .ttf files in this directory.")
        print("---")

    def printable(text):
        """Return *text* as a string the active font is able to encode."""
        text = str(text)
        if unicode_font:
            return text
        # Built-in fonts only cover Latin-1; replace anything else.
        return text.encode("latin-1", "replace").decode("latin-1")

    pdf.add_page()
    pdf.set_font(pdf.report_font, "", 12)

    # Output order of the report; only keys present in `data` are written.
    key_order = [
        "Client Name", "Date of Exam", "Date of Accident", "Examinee", "Observed By",
        "Performed By", "Specialty", "Facility", "Facility Description",
        "Appointment Scheduled For", "Arrived at Office", "Admitted to Exam Room",
        "Intake Start", "Exam Start", "Exam End", "Length of Exam", "Total Length of Visit",
        "Others Present", "Description of IME physician", "Layout of Exam Room",
        "Did IME Physician Have Examinees Medical Records", "Intake", "Exam"
    ]
    for key in key_order:
        if key not in data:
            continue
        pdf.set_font(pdf.report_font, "B", 12)
        pdf.cell(0, 10, f"{key}:", ln=True)
        pdf.set_font(pdf.report_font, "", 12)
        pdf.multi_cell(0, 8, printable(data[key]))
        pdf.ln(4)

    pdf.output(output_path)
    print(f"✅ Successfully generated PDF report at: {output_path}")
def main():
    """Run the pipeline: extract the text, correct it, and write the PDF.

    Returns:
        0 when the report was generated, 1 when the input file is missing or
        no text could be extracted from it.
    """
    print("--- Starting Document Transformation Pipeline ---")
    if not os.path.exists(INPUT_DOC_PATH):
        print(f"❌ ERROR: Input file not found: '{INPUT_DOC_PATH}'")
        return 1

    print(f"1. Extracting text from '{INPUT_DOC_PATH}' using textract...")
    raw_document_text = extract_text_from_doc(INPUT_DOC_PATH)
    if not raw_document_text:
        # Extraction failed or produced nothing; say so instead of exiting
        # silently with a success status.
        print(f"❌ ERROR: No text could be extracted from '{INPUT_DOC_PATH}'")
        return 1

    print("2. Parsing and correcting text via AI microservice...")
    corrected_data = parse_and_correct_text(raw_document_text)
    print(f"3. Generating PDF report '{OUTPUT_PDF_PATH}'...")
    generate_pdf(corrected_data, OUTPUT_PDF_PATH)
    print("--- Pipeline Finished ---")
    return 0


if __name__ == "__main__":
    # Propagate failure to the shell through the exit status.
    raise SystemExit(main())
|