Source code for scrapemed._validate

"""
ScrapeMed's ``_validate`` Module
=================================

Validation module for determining whether XML conforms to a format
supported by the scrapemed package (NLM Articleset 2.0 DTD).

**Custom Exception**:
    - ``noDTDFoundError``: Raised when no DTD specification can be found in the
        downloaded XML.
"""

import re
import lxml.etree as ET
import os
from io import StringIO
from scrapemed.utils import cleanerdoc

# System identifiers (URLs) of the DTDs this module is able to validate
# against. The final path segment of each URL is used as the filename of the
# DTD copy looked up on disk.
SUPPORTED_DTD_URLS = [
    "https://dtd.nlm.nih.gov/ncbi/pmc/articleset/nlm-articleset-2.0.dtd",
]

# Captures (group 1) the first double-quoted http(s) URL found in a string.
DTD_URL_PATTERN = re.compile(r'"(https?://\S+)"')
# Matches everything after the last "/" of a string, i.e. a URL's filename.
END_OF_URL_PATTERN = re.compile(r"[^/]+$")



[docs]
class noDTDFoundError(Exception):
    """
    Signals that XML validation cannot proceed because no DTD specification
    could be found for a downloaded XML document.
    """



# ---------------------------DATA VALIDATION-------------------------------

[docs]
def validate_xml(xml: ET.ElementTree) -> bool:
    """
    Validate an XML ElementTree against a supported Document Type Definition
    (DTD).

    The DTD URL is read from the doctype declaration of ``xml`` and must be
    one of ``SUPPORTED_DTD_URLS``. The DTD itself is not downloaded: the file
    whose name matches the last path segment of that URL is loaded from the
    ``data/DTDs`` directory located next to this module, and ``xml`` is
    validated against it. Currently only NLM Articleset 2.0 (The DTD used by
    PubMed Central) is supported.

    :param ET.ElementTree xml: An XML ElementTree to be validated.

    :return: True if the XML is validated successfully against a supported DTD,
        False otherwise.
    :rtype: bool

    :raises noDTDFoundError: If no DTD URL is specified in the XML doctype,
        or if the matching DTD file is missing from the package data
        directory.
    :raises AssertionError: If the DTD URL found in the doctype is not one of
        ``SUPPORTED_DTD_URLS``.
    """
    # Find the DTD URL in the doctype and confirm it's supported.
    url_match = DTD_URL_PATTERN.search(xml.docinfo.doctype)
    if url_match is None:
        raise noDTDFoundError(
            cleanerdoc(
                """A DTD must be specified for validation. Set
                       validate=false if you want to proceeed without
                       validation."""
            )
        )

    url = url_match.group(1)
    if url not in SUPPORTED_DTD_URLS:
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped when Python runs with -O; the exception type is kept as
        # AssertionError so existing callers observe the same error.
        raise AssertionError(f"Unsupported DTD: {url}")

    # The local DTD copy is named after the final path segment of the URL.
    dtd_filename = END_OF_URL_PATTERN.search(url).group(0)
    dtd_filepath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "data", "DTDs", dtd_filename
    )

    try:
        with open(dtd_filepath, "r", encoding="utf-8") as f:
            dtd_doc = f.read()
    except FileNotFoundError as err:
        # A missing file surfaces as FileNotFoundError from open(); translate
        # it into the documented package-level error.
        raise noDTDFoundError(
            cleanerdoc(
                """DTD not found in scrapemed package. Ensure you are using the
                latest package version."""
            )
        ) from err

    dtd = ET.DTD(StringIO(dtd_doc))

    return dtd.validate(xml)



# -------------------------END DATA VALIDATION-------------------------------