"""
ScrapeMed's Markup Language Cleaning Utilities
===============================================
Scrapemed module for markup language cleaning utilities.
.. warns::
unexpectedTagWarning
Warned when an unexpected tag enclosed in angle brackets is found.
"""
import warnings
import re
from scrapemed.utils import basicBiMap
import scrapemed._morehtml as mhtml
# Replace the global warning formatter so that warnings are rendered as
# "<CategoryName>: <message>" followed by a blank line, without the
# filename / line number / source line details.
def _format_warning(msg, category, *args, **kwargs):
    """Format a warning as ``"<category name>: <message>"`` plus a blank line."""
    return f"{category.__name__}: {msg}\n\n"


warnings.formatwarning = _format_warning
[docs]
class unexpectedTagWarning(Warning):
    """
    Warning category for unexpected tags found in text.

    Issued when a tag enclosed in angle brackets is encountered that is not
    one of the tags the cleaning utilities expect.
    """
[docs]
def clean_xml_string(xml_string: str, strip_text_styling=True, verbose=False):
    """
    Clean an XML string.

    Currently the only cleaning step is the optional handling of HTML text
    styling tags, which is delegated to ``_remove_text_styling``.

    :param str xml_string: The XML string to be cleaned.
    :param bool strip_text_styling: Whether to remove or replace HTML text
        styling tags. When false, the input is returned unchanged.
    :param bool verbose: Whether to print verbose output.
    :return: The cleaned XML string.
    :rtype: str
    """
    # Nothing to do unless styling removal was requested.
    if not strip_text_styling:
        return xml_string
    return _remove_text_styling(xml_string, verbose=verbose)
def _remove_text_styling(text: str, verbose=False) -> str:
    """
    Remove or replace the default set of HTML text styling tags in ``text``.

    This is a convenience wrapper around ``_remove_html_styling`` that
    supplies a fixed set of tags:

    - italic, bold and underline tags (``<italic>``, ``<i>``, ``<bold>``,
      ``<b>``, ``<underline>``, ``<u>``) are passed as removals;
    - ``<sub>`` is replaced with ``"_"``, ``<sup>`` with ``"^"`` and
      ``<ext-link>`` with ``"[External URI:]"``.

    :param str text: The text containing HTML stylings to be removed or
        replaced.
    :param bool verbose: Whether to print verbose output.
    :return: The text as returned by ``_remove_html_styling`` for the default
        removals and replacements listed above.
    :rtype: str
    """
    # Styling tags that are dropped entirely.
    tags_to_drop = ["<italic>", "<i>", "<bold>", "<b>", "<underline>", "<u>"]
    # Opening tags that are swapped for a plain-text marker.
    tag_substitutes = {
        "<sub>": "_",
        "<sup>": "^",
        "<ext-link>": "[External URI:]",
    }
    cleaned = _remove_html_styling(
        text,
        removals=tags_to_drop,
        replaces=tag_substitutes,
        verbose=verbose,
    )
    return cleaned
def _remove_html_styling(
text: str, removals: list[str], replaces: dict, verbose=False
) -> str:
"""
Remove specified HTML stylings from the provided text.
:param str text: The text containing HTML stylings to be removed.
:param list[str] removals: A list of opening tags to be removed. Their
corresponding closing tags will also be removed. Tags will be removed
regardless of attributes.
:param dict replaces: A dictionary of find, replace values. The find values
should be HTML opening tags. They will be matched regardless of attributes.
:param bool verbose: Whether to print verbose output.
:return: The XML string with specified HTML text styling tags removed.
:rtype: str
"""
# ADD IN CLOSING TAGS FOR REMOVAL TAGS
to_remove = removals.copy()
more_to_remove = []
for tag in to_remove:
more_to_remove.append(tag[0] + "/" + tag[1:])
to_remove.extend(more_to_remove)
# ADD IN CLOSING TAGS FOR REPLACEMENT TAGS
to_replace_basic = replaces.copy()
for tag in to_replace_basic.keys():
to_remove.append(tag[0] + "/" + tag[1:])
# MATCH REGARDLESS OF HTML ATTRIBUTES
# Sample of what removals should look like for tag matching
# regardless of attributes
# /<head\b[^>]*>/i
for i in range(len(to_remove)):
to_remove[i] = to_remove[i][0:-1] + "\\b[^>]*" + to_remove[i][-1]
to_replace = {}
for find, replace in to_replace_basic.items():
new_find = find[0:-1] + "\\b[^>]*" + find[-1]
to_replace[new_find] = replace
# REPORT REQUESTED BEHAVIOR AT RUNTIME
if verbose:
print(f"Removing the following tags:\n{to_remove}\n")
print("Making the following replacements:\n")
for find, replace in to_replace.items():
print(f"{find} replaced with {replace}\n")
# REMOVALS
removal_pattern = "|".join(to_remove)
r = re.compile(removal_pattern, re.IGNORECASE)
text = r.sub("", text)
# REPLACEMENTS
for find, replace in to_replace.items():
text = re.sub(find, replace, text)
# RETURN THE CLEANED TEXT
return text
[docs]
def split_text_and_refs(
    tree_text: str, ref_map: basicBiMap, id=None, on_unknown="keep"
):
    """
    Split HTML tags out of text.

    - HTML text styling tags will be removed if they aren't already.
    - <xref>, <table-wrap>, and <fig> tags will be converted to MHTML tags
      containing the key to use when searching for these references, tables,
      and figures. For <xref> the enclosed text is kept in front of the MHTML
      tag; for <fig> and <table-wrap> only the MHTML tag is emitted.
    - Any other tag triggers an ``unexpectedTagWarning`` and is handled
      according to ``on_unknown``. Kept contents are inserted as-is; tags
      nested inside them are not processed further.

    Returns the cleaned text and updates the passed BiMap for any new key-tag
    pairs found.

    :param str tree_text: A string representing a markup language tree
        containing HTML tags.
    :param ref_map: A BiMap containing keys connected to reference tag values.
        BiMap forward keys should be reference keys to place into the text in
        lieu of the tag for later BiMap table lookup. BiMap forward values
        should be the actual tags. The provided BiMap will be modified in
        place to reflect any new tag values found, and keys will be appended
        as necessary.
    :type ref_map: basicBiMap
    :param id: Optionally provide an id for traceback of any issues.
    :type id: Any, optional
    :param str on_unknown: Behavior when encountering an unknown tag.
        Determines what happens to the tag contents. Default is 'keep'.
        Options: ['drop', 'keep']. Any value other than 'keep' drops the
        contents.
    :return: The cleaned text. ``ref_map`` is updated in place and is not
        returned.
    :rtype: str
    """
    XREF_TAG_NAME = "xref"
    FIGURE_TAG_NAME = "fig"
    TABLEWRAP_TAG_NAME = "table-wrap"
    ALLOWED_TAG_NAMES = [XREF_TAG_NAME, FIGURE_TAG_NAME, TABLEWRAP_TAG_NAME]

    # Two alternatives:
    #   1. an opening tag through to its matching closing tag
    #      (group 1 = tag name, group 2 = enclosed contents)
    #   2. a lone or self-closing tag (group 3 = tag name)
    XML_HTML_TAG_PATTERN = (
        r"<([a-zA-Z][\w-]*)\b[^>]*>(.*?)</\1>|<([a-zA-Z][\w-]*)\b[^/>]*/?>"
    )
    tag_r = re.compile(
        XML_HTML_TAG_PATTERN, re.DOTALL
    )  # DOTALL used in case of multiline tag spans

    text = _remove_text_styling(tree_text.strip())

    # Collect output fragments and join once at the end instead of repeatedly
    # concatenating and re-slicing the input.
    pieces = []
    cursor = 0  # index in `text` just past the last processed tag
    for match in tag_r.finditer(text):
        # Groups 1 and 2 are None when the lone-tag alternative matched, so
        # fall back to group 3 for the name and to empty contents. Without
        # this the name was reported as None and on_unknown="keep" raised a
        # TypeError when appending the None contents.
        tag_name = match.group(1) or match.group(3)
        tag_contents = match.group(2) or ""
        full_tag = match.group()

        # ADD CONTENTS PRIOR TO TAG
        pieces.append(text[cursor : match.start()])
        cursor = match.end()

        # UNKNOWN TAG PROCESSING, WARN AND PERFORM SPECIFIED BEHAVIOR
        if tag_name not in ALLOWED_TAG_NAMES:
            warning_msg = (
                f"Tag of type {tag_name} found in a text portion of "
                "the provided markup language. "
                "Expected only HTML styling tags, or tags from the "
                f"following list: {ALLOWED_TAG_NAMES}."
                f" Specified unknown tag behavior: {on_unknown}."
            )
            if id:
                warning_msg += (
                    " Warning occured in a text section " f"with id: {id}."
                )
            warnings.warn(warning_msg, unexpectedTagWarning)
            if on_unknown == "keep":
                pieces.append(tag_contents)
            continue

        # KNOWN TAG PROCESSING, UPDATE DATA REF
        # Only xref tags keep their enclosed text in the output.
        if tag_name == XREF_TAG_NAME:
            pieces.append(tag_contents)

        # Reuse the key of a tag seen before, otherwise append a new key.
        if full_tag in ref_map.reverse:
            ref_num = ref_map.reverse[full_tag]
        else:
            ref_num = len(ref_map)
            ref_map[ref_num] = full_tag

        data_ref_tag = mhtml.generate_typed_mhtml_tag(
            tag_type="dataref", string=str(ref_num)
        )
        pieces.append(f"{data_ref_tag}")

    # No more tags to deal with, add the remaining text.
    pieces.append(text[cursor:])
    return "".join(pieces)