# Source code for scrapemed._validate

"""
ScrapeMed's ``_validate`` Module
=================================

Validation module for determining whether XML conforms to a format
supported by the scrapemed package (NLM Articleset 2.0 DTD).

**Custom Exception**:
    - ``noDTDFoundError``: Raised when no DTD specification can be found in the
        downloaded XML.
"""

import re
import lxml.etree as ET
import os
from io import StringIO
from scrapemed.utils import cleanerdoc

SUPPORTED_DTD_URLS = [
    "https://dtd.nlm.nih.gov/ncbi/pmc/articleset/nlm-articleset-2.0.dtd"
]
# Regex DTD URL Patterns
DTD_URL_PATTERN = re.compile(r'"(https?://\S+)"')
END_OF_URL_PATTERN = re.compile(r"[^/]+$")


class noDTDFoundError(Exception):
    """
    Raised when no DTD specification can be found in a downloaded XML,
    preventing validation.
    """
# ---------------------------DATA VALIDATION-------------------------------
def validate_xml(xml: ET.ElementTree) -> bool:
    """
    Validate an XML ElementTree against a supported Document Type
    Definition (DTD).

    The DTD URL is extracted from the XML doctype, checked against
    ``SUPPORTED_DTD_URLS``, and the DTD file with the same basename is
    loaded from the ``data/DTDs`` directory located next to this module.
    Currently only NLM Articleset 2.0 (the DTD used by PubMed Central) is
    supported.

    :param ET.ElementTree xml: An XML ElementTree to be validated.
    :return: True if the XML is validated successfully against a supported
        DTD, False otherwise.
    :rtype: bool
    :raises noDTDFoundError: If no DTD URL is specified in the XML doctype,
        or if the corresponding DTD file is missing from the package.
    :raises AssertionError: If the DTD URL found in the doctype is not one
        of ``SUPPORTED_DTD_URLS``.
    """
    # Find the DTD URL in the doctype and confirm it is supported.
    url_match = DTD_URL_PATTERN.search(xml.docinfo.doctype)
    if url_match is None:
        raise noDTDFoundError(
            cleanerdoc(
                "A DTD must be specified for validation. "
                "Set validate=false if you want to proceeed without validation."
            )
        )

    url = url_match.group(1)
    # Raised explicitly instead of using an ``assert`` statement so the check
    # is not stripped when Python runs with -O; AssertionError is kept so
    # existing callers observe the same exception type.
    if url not in SUPPORTED_DTD_URLS:
        raise AssertionError(f"Unsupported DTD URL: {url}")

    # The bundled DTD file is named after the final path segment of the URL.
    dtd_filename = END_OF_URL_PATTERN.search(url).group(0)
    dtd_filepath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "data", "DTDs", dtd_filename
    )

    # A missing file surfaces as FileNotFoundError from open(); translate it
    # into the package's own error so the "DTD not found" message is
    # actually reachable.
    try:
        with open(dtd_filepath, "r") as dtd_file:
            dtd_doc = dtd_file.read()
    except FileNotFoundError as err:
        raise noDTDFoundError(
            cleanerdoc(
                "DTD not found in scrapemed package. "
                "Ensure you are using the latest package version."
            )
        ) from err

    dtd = ET.DTD(StringIO(dtd_doc))
    return dtd.validate(xml)
# -------------------------END DATA VALIDATION-------------------------------