Spaces:
Running
Running
| import re | |
| from typing import Optional | |
| from urllib.parse import urljoin | |
| import html2text | |
| from bs4 import BeautifulSoup, Tag | |
class Converter:
    """Convert HTML to Markdown, rewriting media elements as Markdown images.

    The class holds no state; every method is static and can be called either
    on the class (``Converter.html_to_markdown(...)``) or on an instance.
    """

    @staticmethod
    def html_to_markdown(
        html: str,
        base_url: str,
        parser_features: str = "html.parser",
        **conversion_options,
    ) -> str:
        """Convert an HTML string to Markdown.

        Args:
            html: The HTML markup to convert.
            base_url: URL against which relative media URLs are resolved.
            parser_features: Parser name passed to ``BeautifulSoup``.
            **conversion_options: Attribute names and values set on the
                ``html2text.HTML2Text`` instance before conversion.

        Returns:
            The Markdown text produced by ``html2text``.
        """
        soup = BeautifulSoup(html, parser_features)
        cleaned_soup = Converter.replace_media_with_markdown(soup, base_url)
        return Converter.convert_html_to_markdown(
            str(cleaned_soup), **conversion_options
        )

    @staticmethod
    def replace_media_with_markdown(
        soup: BeautifulSoup, base_url: str
    ) -> BeautifulSoup:
        """Replace ``<img>`` and ``<video>`` tags with Markdown image links.

        The tree is modified in place. Tags whose URL is a ``data:`` URI are
        removed from the tree; tags with no URL at all are left untouched.

        Args:
            soup: Parsed document to rewrite.
            base_url: URL against which relative media URLs are resolved.

        Returns:
            The same ``soup`` object, after modification.
        """
        # All images are processed before any video, as in the original order.
        for tag_name, media_type in (("img", "Image"), ("video", "Video")):
            for tag in soup.find_all(tag_name):
                markdown = Converter._media_tag_to_markdown(
                    tag, media_type, base_url
                )
                if markdown:
                    tag.replace_with(markdown)
        return soup

    @staticmethod
    def _media_tag_to_markdown(
        tag: Tag, media_type: str, base_url: str
    ) -> Optional[str]:
        """Build the Markdown replacement for one media tag.

        Returns ``None`` when the tag should not be replaced: either it has
        no URL (tag is left in the tree) or its URL is a ``data:`` URI (tag
        is decomposed, i.e. removed from the tree).
        """
        media_url = (tag.get("src") or "").strip()
        if not media_url:
            # <video> usually carries its URL on a nested <source> element.
            source = tag.find("source", src=True)
            if source is not None:
                media_url = (source.get("src") or "").strip()
        if not media_url:
            # Nothing to link to; resolving "" would point at base_url itself.
            return None

        # A prefix test catches every data URI, including subtypes with
        # digits (video/mp4) and extra parameters (;charset=utf-8).
        if media_url.lower().startswith("data:"):
            tag.decompose()
            return None

        # urljoin returns absolute URLs unchanged and resolves root-relative
        # ("/a.png") and protocol-relative ("//host/a.png") references, so
        # the leading slash must not be stripped first.
        media_url = urljoin(base_url, media_url)

        description = tag.get("alt", "") or tag.get("title", "")
        alt_text = f"{media_type}: {description}".strip()
        return f"![{alt_text}]({media_url})"

    @staticmethod
    def convert_html_to_markdown(cleaned_html: str, **options) -> str:
        """Run ``html2text`` over ``cleaned_html`` and return the Markdown.

        Args:
            cleaned_html: HTML markup to convert.
            **options: Each key is set as an attribute on the
                ``html2text.HTML2Text`` converter before it runs. Names are
                not validated here.

        Returns:
            The converter's output for ``cleaned_html``.
        """
        converter = html2text.HTML2Text()
        for key, value in options.items():
            setattr(converter, key, value)
        return converter.handle(cleaned_html)