import base64
from io import BytesIO
import json
import os
import PyPDF2  # library for reading PDF page counts
from openai import OpenAI
from dotenv import load_dotenv
from typhoon_ocr import prepare_ocr_messages
import gradio as gr
from PIL import Image

# Load environment variables
load_dotenv()

# Configure the OpenAI-compatible client for the Typhoon API
openai = OpenAI(
    base_url=os.getenv("TYPHOON_BASE_URL"),
    api_key=os.getenv("TYPHOON_API_KEY"),
)
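
# The client above expects the following variables in the environment (typically a
# local .env file). The values below are illustrative placeholders, not real
# endpoints, keys, or model names:
#
#   TYPHOON_BASE_URL=<OpenAI-compatible endpoint for the Typhoon API>
#   TYPHOON_API_KEY=<your API key>
#   TYPHOON_OCR_MODEL=<Typhoon OCR model name>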
# Theme setup (kept from the original demo)
theme = gr.themes.Soft(
    primary_hue=gr.themes.Color(
        c50="#f7f7fd",
        c100="#dfdef8",
        c200="#c4c1f2",
        c300="#a29eea",
        c400="#8f8ae6",
        c500="#756fe0",
        c600="#635cc1",
        c700="#4f4a9b",
        c800="#433f83",
        c900="#302d5e",
        c950="#302d5e",
    ),
    secondary_hue="rose",
    neutral_hue="stone",
)
# File that accumulates all OCR results
OUTPUT_FILE = "ocr_results.txt"


def save_ocr_result(text):
    """Append an OCR result to the single shared output file."""
    with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
        f.write(text + "\n\n")
    return OUTPUT_FILE


def clear_output_file():
    """Clear any previous results from the output file."""
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.write("")


def get_pdf_page_count(pdf_path):
    """Return the number of pages in a PDF, or 0 if the file cannot be read."""
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            return len(reader.pages)
    except Exception:
        return 0
def process_pdf(pdf_or_image_file, task_type):
    if pdf_or_image_file is None:
        return None, "No file uploaded", None

    orig_filename = pdf_or_image_file.name
    combined_text = ""
    image_pil = None  # holds the rendered first page for the preview

    try:
        # Check whether the upload is a PDF
        if orig_filename.lower().endswith(".pdf"):
            total_pages = get_pdf_page_count(orig_filename)
            if total_pages == 0:
                return None, "Could not read the number of pages in the PDF", None

            # Process every page
            for page_num in range(1, total_pages + 1):
                # Prepare the OCR prompt (text instructions plus a base64 page image)
                messages = prepare_ocr_messages(
                    pdf_or_image_path=orig_filename,
                    task_type=task_type,
                    target_image_dim=1800,
                    target_text_length=8000,
                    page_num=page_num,
                )

                # Keep the first page's image as the preview
                if page_num == 1:
                    image_url = messages[0]["content"][1]["image_url"]["url"]
                    image_base64 = image_url.replace("data:image/png;base64,", "")
                    image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))

                # Send the request to the API
                response = openai.chat.completions.create(
                    model=os.getenv("TYPHOON_OCR_MODEL"),
                    messages=messages,
                    max_tokens=16384,
                    extra_body={
                        "repetition_penalty": 1.2,
                        "temperature": 0.1,
                        "top_p": 0.6,
                    },
                )
                text_output = response.choices[0].message.content

                # The model is expected to return JSON containing a `natural_text` field
                try:
                    json_data = json.loads(text_output)
                    markdown_out = json_data.get("natural_text", "").replace("<figure>", "").replace("</figure>", "")
                except Exception as e:
                    markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"

                # Combine the results of all pages
                combined_text += f"[Page {page_num}]\n{markdown_out}\n\n"

            # Save the combined result to the output file
            save_ocr_result(combined_text)
            return image_pil, combined_text, OUTPUT_FILE
        # Otherwise the upload is an image file
        else:
            # Process it as a single page
            messages = prepare_ocr_messages(
                pdf_or_image_path=orig_filename,
                task_type=task_type,
                target_image_dim=1800,
                target_text_length=8000,
                page_num=1,
            )

            # Decode the rendered image for the preview
            image_url = messages[0]["content"][1]["image_url"]["url"]
            image_base64 = image_url.replace("data:image/png;base64,", "")
            image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))

            # Send the request to the API
            response = openai.chat.completions.create(
                model=os.getenv("TYPHOON_OCR_MODEL"),
                messages=messages,
                max_tokens=16384,
                extra_body={
                    "repetition_penalty": 1.2,
                    "temperature": 0.1,
                    "top_p": 0.6,
                },
            )
            text_output = response.choices[0].message.content

            # Extract `natural_text` from the JSON response
            try:
                json_data = json.loads(text_output)
                markdown_out = json_data.get("natural_text", "").replace("<figure>", "").replace("</figure>", "")
            except Exception as e:
                markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"

            # Save the result to the output file
            save_ocr_result(markdown_out)
            return image_pil, markdown_out, OUTPUT_FILE

    except Exception as e:
        return None, f"Error processing file: {str(e)}", None
# Build the UI
with gr.Blocks(theme=theme) as demo:
    title = gr.HTML("""
<h1>Typhoon OCR</h1>
<ul>
<li>🤗 <b>Model weights</b>: <a href="https://huggingface.co/scb10x/typhoon-ocr-7b" target="_blank">https://huggingface.co/scb10x/typhoon-ocr-7b</a></li>
</ul>
<br />
<details>
<summary><strong>Disclaimer</strong></summary>
The responses generated by this AI system are autonomously constructed and do not necessarily reflect the views or positions of the developing organizations, their affiliates, or any of their employees. These AI-generated responses do not represent those of the organizations. The organizations do not endorse, support, sanction, encourage, verify, or agree with the comments, opinions, or statements generated by this AI. The information produced by this AI is not intended to malign any religion, ethnic group, club, organization, company, individual, anyone, or anything. It is not the intent of the organizations to malign any group or individual. The AI operates based on its programming and training data and its responses should not be interpreted as the explicit intent or opinion of the organizations.
</details>
<br />
<details>
<summary><strong>Terms of use</strong></summary>
By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. Vision language models are prone to hallucinations to a greater extent compared to text-only LLMs.
</details>
<br />
<details>
<summary><strong>License</strong></summary>
This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses. The content of this project itself is licensed under the Apache license 2.0.
</details>
""")
    with gr.Row():
        with gr.Column(scale=1):
            # File upload
            pdf_input = gr.File(label="📄 Upload Image file or PDF file", file_types=[".pdf", ".png", ".jpg", ".jpeg"])

            # Task selection
            with gr.Group(elem_classes=["task-background"]):
                task_dropdown = gr.Radio(["default", "structure"], label="🎯 Select Task", value="default")
                gr.HTML("""
                    <p><b>default</b>: This mode works for most cases and is recommended for files without a clear template, such as infographics.</p>
                    <p><b>structure</b>: This mode offers improved performance on documents with complex layouts, such as those containing images, tables, and forms.</p>
                    <p>We recommend trying both and seeing which one works better for your use case.</p>
                """, elem_classes=["task-dropdown-info"])
demo.css = """
.task-background {
background: var(--block-background-fill) !important;
}
.task-background > * {
background: var(--block-background-fill) !important;
}
.task-dropdown-info {
padding: 0 16px;
font-size: 12px;
}
"""
            # Run button
            run_button = gr.Button("🚀 Run")

            # Image preview
            image_output = gr.Image(label="📸 Preview Image", type="pil")

        with gr.Column(scale=2):
            # Markdown result
            markdown_output = gr.Markdown(label="Markdown Result", show_label=True)

            # Download of the accumulated results
            download_button = gr.File(label="📥 Download all results (Text File)", interactive=False)
    # Wire the UI to the processing function
    run_button.click(
        fn=process_pdf,
        inputs=[pdf_input, task_dropdown],
        outputs=[image_output, markdown_output, download_button]
    )
# Start fresh (clear the old results file)
clear_output_file()

# Launch the app
demo.launch(share=False)