"""
ScrapeMed's Scrape Module
============================
ScrapeMed's `scrape` module handles PubMed Central data searching
and downloads.
This module also handles conversion of raw XML data to
lxml.etree.ElementTree objects.
..warnings::
- :class:`validationWarning` - Warned when downloading PMC XML without
validating.
"""
import os
import warnings
from typing import List

import lxml.etree as ET
from Bio import Entrez

import scrapemed._clean as _clean
import scrapemed._validate as _validate
[docs]
class validationWarning(Warning):
    """
    Warning category used when PMC XML is downloaded without validating.

    Being a :class:`Warning` subclass, it can be filtered or escalated
    with the standard :mod:`warnings` machinery.
    """
# ---------------------Download Funcs for PubMed Central-----------------------
[docs]
def search_pmc(email: str, term: str, retmax: int = 10, verbose: bool = False) -> dict:
    """
    Wrapper for Bio.Entrez's esearch function to retrieve PMC search results.

    Sets the module-level ``Entrez.email`` to ``email`` before querying.

    :param str email: Use your email to authenticate with PMC.
    :param str term: The search term.
    :param int retmax: The maximum number of PMCIDs to return. Default is 10.
    :param bool verbose: Whether to display verbose output. Default is False.

    :return: A dictionary containing search results, including PMCIDs.
    :rtype: dict
    """
    DB = "pmc"
    Entrez.email = email

    # Announce the search before issuing the request so the message is
    # an actual progress indicator rather than an after-the-fact note.
    if verbose:
        print(f"\nSearching {DB}...\n")

    handle = Entrez.esearch(db=DB, retmax=retmax, term=term, idtype="pmc")
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the handle, even if parsing the response fails.
        handle.close()

    if verbose:
        print(f"Number of results found: {record['Count']}")
    return record
[docs]
def get_xmls(
    pmcids: List[int],
    email: str,
    download=False,
    validate=True,
    strip_text_styling=True,
    verbose=False,
) -> List[ET.ElementTree]:
    """
    Fetch the XML of several PMC articles, one ElementTree per PMCID.

    Each PMCID is handed to :func:`get_xml` with the same options, so the
    XMLs are validated and cleaned by default. Results are returned in the
    same order as ``pmcids``.

    :param List[int] pmcids: List of PMCIDs of articles to retrieve.
    :param str email: Use your email to authenticate with PMC.
    :param bool download: Whether or not to download the XMLs. Default is False.
    :param bool validate: Whether or not to validate the retrieved XMLs
        (HIGHLY RECOMMENDED). Default is True.
    :param bool strip_text_styling: Whether or not to clean common HTML text
        styling from the text (HIGHLY RECOMMENDED). Default is True.
    :param bool verbose: Whether to display verbose output. Default is False.

    :return: List of ElementTrees of the XMLs corresponding to
        the provided PMCIDs.
    :rtype: List[ET.ElementTree]
    """
    trees = []
    for article_id in pmcids:
        article_tree = get_xml(
            pmcid=article_id,
            email=email,
            download=download,
            validate=validate,
            strip_text_styling=strip_text_styling,
            verbose=verbose,
        )
        trees.append(article_tree)
    return trees
[docs]
def get_xml(
    pmcid: int,
    email: str,
    download=False,
    validate=True,
    strip_text_styling=True,
    verbose=False,
) -> ET.ElementTree:
    """
    Fetch the XML of a single PMC article and parse it into an ElementTree.

    The XML is cleaned and, by default, validated. When validation is
    skipped, a :class:`validationWarning` is issued instead.

    :param int pmcid: PMCID of the article to retrieve.
    :param str email: Use your email to authenticate with PMC.
    :param bool download: Whether or not to download the XML. Default is False.
    :param bool validate: Whether or not to validate the retrieved XML
        (HIGHLY RECOMMENDED). Default is True.
    :param bool strip_text_styling: Whether or not to clean common HTML
        text styling from the text (HIGHLY RECOMMENDED). Default is True.
    :param bool verbose: Whether to display verbose output. Default is False.

    :return: ElementTree of the XML record (validated unless
        ``validate`` is False).
    :rtype: ET.ElementTree
    """
    raw_xml = _get_xml_string(pmcid, email, download, verbose)
    tree = xml_tree_from_string(
        xml_string=raw_xml, strip_text_styling=strip_text_styling, verbose=verbose
    )

    if not validate:
        # Caller opted out of validation: warn and hand back the tree as-is.
        warnings.warn(
            f"Warning! Scraping XML for PMCID {pmcid} from PMC without validating.",
            validationWarning,
        )
        return tree

    # Check that tags, attrs, and values are supported for parsing
    # by the scrapemed package.
    _validate.validate_xml(tree)
    return tree
def _get_xml_string(pmcid: int, email: str, download=False, verbose=False) -> str:
    """
    Retrieve XML text of a research paper from PMC.

    Sets the module-level ``Entrez.email`` to ``email`` before fetching.
    When ``download`` is true, the decoded text is also written (as UTF-8)
    to ``data/entrez_download_PMCID=<pmcid>.xml`` relative to the current
    working directory; the ``data`` directory is created if missing.

    :param int pmcid: PMCID of the article to retrieve.
    :param str email: Email of the user requesting data from PMC.
    :param bool download: Whether or not to download the XML. Default is False.
    :param bool verbose: Whether to display verbose output. Default is False.

    :return: XML Text of the record.
    :rtype: str

    WARNING: THIS FUNCTION DOES NOT VALIDATE THE XML.
    """
    DB = "pmc"
    RETTYPE = "full"
    RETMODE = "xml"
    DOWNLOAD_DIR = "data"

    Entrez.email = email

    if verbose:
        print(f"\nGetting {RETMODE.upper()} string from {DB}...\n")

    # Actually fetch from PMC
    handle = Entrez.efetch(db=DB, id=pmcid, rettype=RETTYPE, retmode=RETMODE)
    try:
        xml_record = handle.read()
    finally:
        # Always release the handle, even if reading the response fails.
        handle.close()
    xml_text = xml_record.decode(encoding="utf-8")

    if verbose:
        print(f"XML Record First 100 bytes: {xml_record[0:100]}")
        print(f"XML Text First 100 Chars: {xml_text[0:100]}")

    if download:
        # Make sure the target directory exists so the write cannot fail
        # with FileNotFoundError on a fresh working directory.
        os.makedirs(DOWNLOAD_DIR, exist_ok=True)
        # Write with the same encoding the payload was decoded with; the
        # platform default encoding may not be able to represent the text.
        with open(
            f"data/entrez_download_PMCID={pmcid}.{RETMODE}", "w", encoding="utf-8"
        ) as f:
            f.write(xml_text)

    return xml_text
# ----------------End Download Funcs for PubMed Central---------------------
# --------------------Convert XML strings -> Trees---------------------
[docs]
def xml_tree_from_string(
    xml_string: str, strip_text_styling, verbose=False
) -> ET.ElementTree:
    """
    Clean an XML string and parse it into an lxml ElementTree.

    The input is first passed through ``_clean.clean_xml_string`` and the
    cleaned result is parsed with ``lxml.etree.fromstring``.

    :param str xml_string: A string or bytestream representing XML.
    :param bool strip_text_styling: Whether to remove HTML text styling tags or not.
    :param bool verbose: Whether to display verbose output. Default is False.
        Currently unused by this function.

    :return: An lxml.etree.ElementTree of the passed string.
    :rtype: ET.ElementTree
    """
    cleaned_xml = _clean.clean_xml_string(xml_string, strip_text_styling)
    root = ET.fromstring(cleaned_xml)
    return ET.ElementTree(root)
# --------------------End Convert XML strings -> Trees---------------------