Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -27,12 +27,11 @@ def xml2text(xml):
|
|
| 27 |
|
| 28 |
def clean_text(content):
|
| 29 |
"""Cleans text content based on the 'clean' parameter."""
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
content = content.strip()
|
| 36 |
return content
|
| 37 |
|
| 38 |
|
|
@@ -66,7 +65,7 @@ def extract_text_from_docx(docx_data, clean=True):
|
|
| 66 |
text += xml2text(zipf.read(fname))
|
| 67 |
|
| 68 |
zipf.close()
|
| 69 |
-
if clean
|
| 70 |
text = clean_text(text)
|
| 71 |
return text, len(text)
|
| 72 |
|
|
@@ -100,7 +99,7 @@ def read_document(file, clean=True):
|
|
| 100 |
for cell in row:
|
| 101 |
if cell.value is not None:
|
| 102 |
content += str(cell.value) + ' '
|
| 103 |
-
if clean
|
| 104 |
content = clean_text(content)
|
| 105 |
return content, len(content)
|
| 106 |
except Exception as e:
|
|
|
|
| 27 |
|
| 28 |
def clean_text(content):
    """Normalize whitespace in a piece of extracted text.

    Newlines, carriage returns and tabs are converted to spaces, every run
    of consecutive spaces is collapsed to a single space, and leading and
    trailing whitespace is removed.

    Args:
        content: The text to normalize.

    Returns:
        The normalized text.
    """
    # Turn line breaks and tabs into plain spaces in a single pass.
    content = content.translate(
        str.maketrans({'\n': ' ', '\r': ' ', '\t': ' '})
    )
    # Splitting on a single space yields empty strings wherever spaces were
    # adjacent; dropping them collapses runs of any length, which a single
    # replace('  ', ' ') pass would not.
    content = ' '.join(piece for piece in content.split(' ') if piece)
    return content.strip()
|
| 36 |
|
| 37 |
|
|
|
|
| 65 |
text += xml2text(zipf.read(fname))
|
| 66 |
|
| 67 |
zipf.close()
|
| 68 |
+
if clean:
|
| 69 |
text = clean_text(text)
|
| 70 |
return text, len(text)
|
| 71 |
|
|
|
|
| 99 |
for cell in row:
|
| 100 |
if cell.value is not None:
|
| 101 |
content += str(cell.value) + ' '
|
| 102 |
+
if clean:
|
| 103 |
content = clean_text(content)
|
| 104 |
return content, len(content)
|
| 105 |
except Exception as e:
|