Spaces:

saherPervaiz
/

cv

Sleeping

saherPervaiz committed on May 9

Commit

7cc953c

verified ·

1 Parent(s): cb19513

Update text_extractor.py

Files changed (1) hide show

text_extractor.py CHANGED Viewed

@@ -1,18 +1,25 @@
 # text_extractor.py
 import docx2txt
-import fitz  # PyMuPDF
 def extract_text_from_file(file_path):
-    if file_path.endswith(".pdf"):
-        return extract_text_from_pdf(file_path)
-    elif file_path.endswith(".docx"):
-        return docx2txt.process(file_path)
-    else:
-        return "Unsupported file type."
-def extract_text_from_pdf(file_path):
-    text = ""
-    with fitz.open(file_path) as doc:
-        for page in doc:
-            text += page.get_text()
-    return text

 # text_extractor.py
+import os
 import docx2txt
+import PyPDF2
 def extract_text_from_file(file_path):
+    ext = os.path.splitext(file_path)[1].lower()
+    if ext == ".pdf":
+        try:
+            with open(file_path, "rb") as f:
+                reader = PyPDF2.PdfReader(f)
+                return " ".join([page.extract_text() or "" for page in reader.pages])
+        except:
+            return "[Error extracting PDF text]"
+    elif ext == ".docx":
+        try:
+            return docx2txt.process(file_path)
+        except:
+            return "[Error extracting DOCX text]"
+    else:
+        return "[Unsupported file type]"