Server IP : 85.214.239.14 / Your IP : 13.58.245.158 Web Server : Apache/2.4.62 (Debian) System : Linux h2886529.stratoserver.net 4.9.0 #1 SMP Tue Jan 9 19:45:01 MSK 2024 x86_64 User : www-data ( 33) PHP Version : 7.4.18 Disable Function : pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,pcntl_unshare, MySQL : OFF | cURL : OFF | WGET : ON | Perl : ON | Python : ON | Sudo : ON | Pkexec : OFF Directory : /srv/modoboa/env/lib64/python3.5/site-packages/html2text/ |
Upload File : |
import html.entities from typing import Dict, List, Optional from . import config unifiable_n = { html.entities.name2codepoint[k]: v for k, v in config.UNIFIABLE.items() if k != "nbsp" } def hn(tag: str) -> int: if tag[0] == "h" and len(tag) == 2: n = tag[1] if "0" < n <= "9": return int(n) return 0 def dumb_property_dict(style: str) -> Dict[str, str]: """ :returns: A hash of css attributes """ return { x.strip().lower(): y.strip().lower() for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z] } def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]: """ :type data: str :returns: A hash of css selectors, each of which contains a hash of css attributes. :rtype: dict """ # remove @import sentences data += ";" importIndex = data.find("@import") while importIndex != -1: data = data[0:importIndex] + data[data.find(";", importIndex) + 1 :] importIndex = data.find("@import") # parse the css. reverted from dictionary comprehension in order to # support older pythons pairs = [x.split("{") for x in data.split("}") if "{" in x.strip()] try: elements = {a.strip(): dumb_property_dict(b) for a, b in pairs} except ValueError: elements = {} # not that important return elements def element_style( attrs: Dict[str, Optional[str]], style_def: Dict[str, Dict[str, str]], parent_style: Dict[str, str], ) -> Dict[str, str]: """ :type attrs: dict :type style_def: dict :type style_def: dict :returns: A hash of the 'final' style attributes of the element :rtype: dict """ style = parent_style.copy() if "class" in attrs: assert attrs["class"] is not None for css_class in attrs["class"].split(): css_style = style_def.get("." + css_class, {}) style.update(css_style) if "style" in attrs: assert attrs["style"] is not None immediate_style = dumb_property_dict(attrs["style"]) style.update(immediate_style) return style def google_list_style(style: Dict[str, str]) -> str: """ Finds out whether this is an ordered or unordered list :type style: dict :rtype: str """ if "list-style-type" in style: list_style = style["list-style-type"] if list_style in ["disc", "circle", "square", "none"]: return "ul" return "ol" def google_has_height(style: Dict[str, str]) -> bool: """ Check if the style of the element has the 'height' attribute explicitly defined :type style: dict :rtype: bool """ return "height" in style def google_text_emphasis(style: Dict[str, str]) -> List[str]: """ :type style: dict :returns: A list of all emphasis modifiers of the element :rtype: list """ emphasis = [] if "text-decoration" in style: emphasis.append(style["text-decoration"]) if "font-style" in style: emphasis.append(style["font-style"]) if "font-weight" in style: emphasis.append(style["font-weight"]) return emphasis def google_fixed_width_font(style: Dict[str, str]) -> bool: """ Check if the css of the current element defines a fixed width font :type style: dict :rtype: bool """ font_family = "" if "font-family" in style: font_family = style["font-family"] return "courier new" == font_family or "consolas" == font_family def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int: """ Extract numbering from list element attributes :type attrs: dict :rtype: int or None """ if "start" in attrs: assert attrs["start"] is not None try: return int(attrs["start"]) - 1 except ValueError: pass return 0 def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool: # If it appears to contain a link # don't wrap if not wrap_links and config.RE_LINK.search(para): return True # If the text begins with four spaces or one tab, it's a code block; # don't wrap if para[0:4] == " " or para[0] == "\t": return True # If the text begins with only two "--", possibly preceded by # whitespace, that's an emdash; so wrap. stripped = para.lstrip() if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-": return False # I'm not sure what this is for; I thought it was to detect lists, # but there's a <br>-inside-<span> case in one of the tests that # also depends upon it. if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**": return not wrap_list_items # If the text begins with a single -, *, or +, followed by a space, # or an integer, followed by a ., followed by a space (in either # case optionally proceeded by whitespace), it's a list; don't wrap. return bool( config.RE_ORDERED_LIST_MATCHER.match(stripped) or config.RE_UNORDERED_LIST_MATCHER.match(stripped) ) def escape_md(text: str) -> str: """ Escapes markdown-sensitive characters within other markdown constructs. """ return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text) def escape_md_section(text: str, snob: bool = False) -> str: """ Escapes markdown-sensitive characters across whole document sections. """ text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text) if snob: text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text) text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text) text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text) text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text) return text def reformat_table(lines: List[str], right_margin: int) -> List[str]: """ Given the lines of a table padds the cells and returns the new lines """ # find the maximum width of the columns max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")] max_cols = len(max_width) for line in lines: cols = [x.rstrip() for x in line.split("|")] num_cols = len(cols) # don't drop any data if colspan attributes result in unequal lengths if num_cols < max_cols: cols += [""] * (max_cols - num_cols) elif max_cols < num_cols: max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]] max_cols = num_cols max_width = [ max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width) ] # reformat new_lines = [] for line in lines: cols = [x.rstrip() for x in line.split("|")] if set(line.strip()) == set("-|"): filler = "-" new_cols = [ x.rstrip() + (filler * (M - len(x.rstrip()))) for x, M in zip(cols, max_width) ] else: filler = " " new_cols = [ x.rstrip() + (filler * (M - len(x.rstrip()))) for x, M in zip(cols, max_width) ] new_lines.append("|".join(new_cols)) return new_lines def pad_tables_in_text(text: str, right_margin: int = 1) -> str: """ Provide padding for tables in the text """ lines = text.split("\n") table_buffer = [] # type: List[str] table_started = False new_lines = [] for line in lines: # Toggle table started if config.TABLE_MARKER_FOR_PAD in line: table_started = not table_started if not table_started: table = reformat_table(table_buffer, right_margin) new_lines.extend(table) table_buffer = [] new_lines.append("") continue # Process lines if table_started: table_buffer.append(line) else: new_lines.append(line) return "\n".join(new_lines)