# Source code for scrapemed.scrape

"""
ScrapeMed's Scrape Module
============================

ScrapeMed's `scrape` module handles PubMed Central data searching
and downloads.

This module also handles conversion of raw XML data to
lxml.etree.ElementTree objects.

..warnings::
    - :class:`validationWarning` - Warned when downloading PMC XML without
        validating.
"""


import scrapemed._clean as _clean
import scrapemed._validate as _validate
import lxml.etree as ET
from Bio import Entrez
import warnings
from typing import List


class validationWarning(Warning):
    """
    Warning category emitted when PMC XML is downloaded without being
    validated.
    """
# ---------------------Download Funcs for PubMed Central-----------------------
def search_pmc(email: str, term: str, retmax: int = 10, verbose: bool = False) -> dict:
    """
    Wrapper for Bio.Entrez's esearch function to retrieve PMC search results.

    :param str email: Use your email to authenticate with PMC.
    :param str term: The search term.
    :param int retmax: The maximum number of PMCIDs to return. Default is 10.
    :param bool verbose: Whether to display verbose output. Default is False.

    :return: A dictionary containing search results, including PMCIDs.
    :rtype: dict
    """
    DB = "pmc"
    Entrez.email = email

    handle = Entrez.esearch(db=DB, retmax=retmax, term=term, idtype="pmc")
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even if parsing the response fails.
        handle.close()

    if verbose:
        print(f"\nSearching {DB}...\n")
        print(f"Number of results found: {record['Count']}")

    return record
def get_xmls(
    pmcids: List[int],
    email: str,
    download=False,
    validate=True,
    strip_text_styling=True,
    verbose=False,
) -> List[ET.ElementTree]:
    """
    Fetch the XML of several PMC articles, one tree per PMCID.

    Each PMCID is passed to :func:`get_xml` together with the remaining
    arguments, so the XMLs are validated and cleaned by default.

    :param List[int] pmcids: PMCIDs of the articles to retrieve.
    :param str email: Use your email to authenticate with PMC.
    :param bool download: Whether or not to download the XMLs.
        Default is False.
    :param bool validate: Whether or not to validate the retrieved XMLs
        (HIGHLY RECOMMENDED). Default is True.
    :param bool strip_text_styling: Whether or not to clean common HTML text
        styling from the text (HIGHLY RECOMMENDED). Default is True.
    :param bool verbose: Whether to display verbose output. Default is False.

    :return: ElementTrees of the XMLs, in the same order as ``pmcids``.
    :rtype: List[ET.ElementTree]
    """
    trees = []
    for article_id in pmcids:
        trees.append(
            get_xml(
                article_id, email, download, validate, strip_text_styling, verbose
            )
        )
    return trees
def get_xml(
    pmcid: int,
    email: str,
    download=False,
    validate=True,
    strip_text_styling=True,
    verbose=False,
) -> ET.ElementTree:
    """
    Fetch the XML of a single PMC article and return it as an ElementTree.

    The XML is cleaned and, by default, validated.

    :param int pmcid: PMCID of the article to retrieve.
    :param str email: Use your email to authenticate with PMC.
    :param bool download: Whether or not to download the XML.
        Default is False.
    :param bool validate: Whether or not to validate the retrieved XML
        (HIGHLY RECOMMENDED). Default is True.
    :param bool strip_text_styling: Whether or not to clean common HTML text
        styling from the text (HIGHLY RECOMMENDED). Default is True.
    :param bool verbose: Whether to display verbose output. Default is False.

    :return: ElementTree of the XML record (validated unless ``validate``
        is False, in which case a :class:`validationWarning` is issued).
    :rtype: ET.ElementTree
    """
    raw_xml = _get_xml_string(pmcid, email, download, verbose)
    parsed = xml_tree_from_string(
        xml_string=raw_xml,
        strip_text_styling=strip_text_styling,
        verbose=verbose,
    )

    if not validate:
        # Caller opted out of validation: hand back the tree, but say so.
        message = (
            f"Warning! Scraping XML for PMCID {pmcid} from "
            "PMC without validating."
        )
        warnings.warn(message, validationWarning)
        return parsed

    # Validate tags, attrs, values are supported for
    # parsing by the scrapemed package.
    _validate.validate_xml(parsed)
    return parsed
def _get_xml_string(pmcid: int, email: str, download=False, verbose=False) -> str:
    """
    Retrieve XML text of a research paper from PMC.

    :param int pmcid: PMCID of the article to retrieve.
    :param str email: Email of the user requesting data from PMC.
    :param bool download: Whether or not to download the XML. When True, the
        text is written as UTF-8 to
        ``data/entrez_download_PMCID=<pmcid>.xml`` (relative to the current
        working directory); the ``data`` directory is created if missing.
        Default is False.
    :param bool verbose: Whether to display verbose output. Default is False.

    :return: XML Text of the record.
    :rtype: str

    WARNING: THIS FUNCTION DOES NOT VALIDATE THE XML.
    """
    # Local import: only the optional download step needs it.
    import os

    DB = "pmc"
    RETTYPE = "full"
    RETMODE = "xml"
    Entrez.email = email

    # Actually fetch from PMC
    handle = Entrez.efetch(db=DB, id=pmcid, rettype=RETTYPE, retmode=RETMODE)
    try:
        xml_record = handle.read()
    finally:
        # Release the connection even if the read fails.
        handle.close()
    xml_text = xml_record.decode(encoding="utf-8")

    if verbose:
        print(f"\nGetting {RETMODE.upper()} string from {DB}...\n")
        print(f"XML Record First 100 bytes: {xml_record[0:100]}")
        print(f"XML Text First 100 Chars: {xml_text[0:100]}")

    if download:
        os.makedirs("data", exist_ok=True)
        # Write with the same encoding the record was decoded with, rather
        # than the platform default, so non-ASCII text round-trips.
        with open(
            f"data/entrez_download_PMCID={pmcid}.{RETMODE}", "w", encoding="utf-8"
        ) as f:
            f.write(xml_text)

    return xml_text


# ----------------End Download Funcs for PubMed Central---------------------


# --------------------Convert XML strings -> Trees---------------------
def xml_tree_from_string(
    xml_string: str, strip_text_styling, verbose=False
) -> ET.ElementTree:
    """
    Build an lxml ElementTree from a string of XML.

    The string is first passed through ``_clean.clean_xml_string`` and the
    cleaned result is then parsed.

    :param str xml_string: A string or bytestream representing XML.
    :param bool strip_text_styling: Whether to remove HTML text styling tags
        or not.
    :param bool verbose: Whether to display verbose output. Default is False.
        Currently not used by this function.

    :return: An lxml.etree.ElementTree of the passed string.
    :rtype: ET.ElementTree
    """
    cleaned = _clean.clean_xml_string(xml_string, strip_text_styling)
    root = ET.fromstring(cleaned)
    return ET.ElementTree(root)
# --------------------End Convert XML strings -> Trees---------------------