Source code for scrapemed._parse

"""
ScrapeMed's ``_parse`` Module
=============================================

Parse module for grabbing metadata, text, tables, figures, etc.
from XML trees representing PMC articles.

DTD for the XML should be NLM articleset 2.0.
Otherwise the behavior here may not be as expected.

Middleman between the `scrape` module and the `paper` module for ScrapeMed.

..warnings::
    - :class:`unexpectedMultipleMatchWarning` - Warned when one match is
        expected, but multiple are found.
    - :class:`unexpectedZeroMatchWarning` - Warned when one or more matches are
        expected, and none are found.
    - :class:`badTextFormattingWarning` - Warned when there are issues with text
        formatting.
    - :class:`unmatchedCitationWarning` - Warned when a citation reference is
        made but not matched to an actual <ref> tag.
    - :class:`unmatchedTableWarning` - Warned when a table reference is made
        but not matched to an actual <table-wrap> tag.
    - :class:`unmatchedFigureWarning` - Warned when a figure reference is made
        but not matched to an actual <fig> tag.
"""

import copy
from typing import List, Dict, Tuple, Set
from typing import Union
import scrapemed.scrape as scrape
import lxml.etree as ET
from scrapemed.utils import basicBiMap, cleanerdoc
from scrapemed._text import TextParagraph, TextSection, TextTable, TextFigure
from datetime import datetime
import pandas as pd
import warnings
import textwrap
import uuid


# -----------Custom Warnings & Exceptions for Parsing------------

[docs]
class unexpectedMultipleMatchWarning(Warning):
    """
    Warning category issued when exactly one match was expected in the
    XML, but more than one was found.
    """




[docs]
class unexpectedZeroMatchWarning(Warning):
    """
    Warning category issued when at least one match was expected in the
    XML, but none was found.
    """




[docs]
class badTextFormattingWarning(Warning):
    """
    Warning category issued when there are issues with text formatting.
    """




[docs]
class unmatchedCitationWarning(Warning):
    """
    Warning category issued when the text references a citation that
    cannot be matched to an actual <ref> tag.
    """




[docs]
class unmatchedTableWarning(Warning):
    """
    Warning category issued when the text references a table that
    cannot be matched to an actual <table-wrap> tag.
    """




[docs]
class unmatchedFigureWarning(Warning):
    """
    Warning category issued when the text references a figure that
    cannot be matched to an actual <fig> tag.
    """



# -----------End Custom Warnings & Exceptions for Parsing------------


# ------------------GENERATE PAPER DICTIONARY GIVEN PMCID--------------------

[docs]
def paper_dict_from_pmc(
    pmcid: int,
    email: str,
    download: bool = False,
    validate: bool = True,
    verbose: bool = False,
    suppress_warnings: bool = False,
    suppress_errors: bool = False,
) -> dict:
    """
    Fetch the XML of a PMC article by PMCID and parse it into a dictionary.

    The XML tree is retrieved through ``scrape.get_xml`` and its root is
    handed to :func:`generate_paper_dict`, which does the actual parsing.

    :param int pmcid: Unique PMCID of the article to scrape and parse.
    :param str email: Email address used for authentication with PMC.
    :param bool download: Forwarded to ``scrape.get_xml``; whether to download
        the XML retrieved from PMC.
    :param bool validate: Forwarded to ``scrape.get_xml``; whether to validate
        the XML against the NLM articleset 2.0 DTD (HIGHLY RECOMMENDED).
    :param bool verbose: Whether to print progress output for debugging.
    :param bool suppress_warnings: Whether to silence warnings while parsing.
        Warnings are frequent because PMC XML is highly variable, so
        suppression is recommended when parsing many articles at once.
    :param bool suppress_errors: When True, a parsing failure yields None
        instead of raising.

    :return: The dictionary of values parsed from the article (None when
        parsing fails and errors are suppressed).
    :rtype: dict
    """
    if verbose:
        print(f"Generating Paper object for PMCID = {pmcid}...")

    # Retrieve the XML tree and keep only its root element for parsing.
    xml_root = scrape.get_xml(
        pmcid=pmcid,
        email=email,
        download=download,
        validate=validate,
        verbose=verbose,
    ).getroot()

    return generate_paper_dict(
        pmcid=pmcid,
        paper_root=xml_root,
        verbose=verbose,
        suppress_warnings=suppress_warnings,
        suppress_errors=suppress_errors,
    )




[docs]
def generate_paper_dict(
    pmcid: int,
    paper_root: ET.Element,
    verbose: bool = False,
    suppress_warnings: bool = False,
    suppress_errors: bool = False,
) -> dict:
    """
    Given the root of an XML tree, parse through it and generate
    a flattened dictionary of relevant PMC paper XML information.

    This function expects the XML to be in NLM articleset 2.0 DTD format.

    Optionally, you can suppress warnings and/or errors. If errors are suppressed,
    None will be returned upon failed parsing.

    Warning suppression is scoped to this call: the warning filters that were
    active before the call are restored afterwards, whether parsing succeeds
    or raises.

    :param int pmcid: Unique PMCID for the article being parsed.
    :param ET.Element paper_root: The root element of the PMC paper XML tree.
    :param bool verbose: Whether or not to have verbose output for debugging.
    :param bool suppress_warnings: Whether to suppress warnings while parsing XML.
        Note: Warnings are frequent due to the variable nature of PMC XML data.
        Recommended to suppress when parsing many XMLs at once.
    :param bool suppress_errors: Whether to suppress errors during parsing.
        If suppressed, the exception message is printed and None is returned
        upon a failed parsing attempt.

    :return: A flattened dictionary containing relevant PMC paper XML information.
    :rtype: dict or None if errors are suppressed and parsing fails.
    :raises Exception: Any exception raised while parsing, unless
        ``suppress_errors`` is True.
    """

    def _parse():
        try:
            return _actually_generate_paper_dict(pmcid, paper_root, verbose)
        except Exception as e:
            if not suppress_errors:
                raise
            print(f"An exception occurred: {str(e)}")
            return None

    if not suppress_warnings:
        return _parse()

    # catch_warnings() snapshots the global filter list and restores it on
    # exit (even on error), so the caller's own filter configuration is not
    # replaced and the "ignore" filter cannot leak out of this call.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return _parse()



def _actually_generate_paper_dict(
    pmcid: int, paper_root: ET.Element, verbose: bool = False
) -> dict:
    """
    Actual paper dictionary generation function.

    This function is called by the wrapper function `generate_paper_dict()`.
    Unlike the wrapper function, this function does not have error or warning
        suppression.

    Every ``gather_*`` helper is run against the same root and its result is
    stored under a human-readable key. After the dict is built, the cleaned
    reference map is split into the "Citations", "Tables" and "Figures"
    entries.

    :param int pmcid: Unique PMCID for the article being parsed.
    :param ET.Element paper_root: The root element of the PMC paper XML tree.
    :param bool verbose: Whether or not to have verbose output for debugging.

    :return: A dictionary containing relevant PMC paper XML information.
    :rtype: dict
    """
    root = paper_root
    # KEEP TRACK OF XREFS, TABLES, FIGURES IN BIMAP
    # (THIS WILL BE UPDATED DURING TEXT RETRIEVAL
    # WHEN HTML REF TAGS ARE SPLIT OUT)
    ref_map = basicBiMap()

    # STORE EXTRACTED INFO IN PAPER DICT
    # NOTE: the entries of this literal are evaluated top to bottom, and the
    # order matters: "Abstract" and "Body" receive ref_map (and, per the
    # comment above, fill it), so they must come before the two "Ref Map"
    # entries at the end.
    paper_dict = {
        "PMCID": pmcid,
        "Title": gather_title(root),
        "Authors": gather_authors(root),
        "Non-Author Contributors": gather_non_author_contributors(root),
        "Abstract": gather_abstract(root, ref_map),
        "Body": gather_body(root, ref_map),
        "Journal ID": gather_journal_id(root),
        "Journal Title": gather_journal_title(root),
        "ISSN": gather_issn(root),
        "Publisher Name": gather_publisher_name(root),
        "Publisher Location": gather_publisher_location(root),
        "Article ID": gather_article_id(root),
        "Article Types": gather_article_types(root),
        "Article Categories": gather_article_categories(root),
        "Published Date": gather_published_date(root),
        "Volume": gather_volume(root),
        "Issue": gather_issue(root),
        "First Page": gather_fpage(root),
        "Last Page": gather_lpage(root),
        "Permissions": gather_permissions(root),
        "Funding": gather_funding(root),
        "Footnote": gather_footnote(root),
        "Acknowledgements": gather_acknowledgements(root),
        "Notes": gather_notes(root),
        "Custom Meta": gather_custom_metadata(root),
        # Snapshot taken before _clean_ref_map runs on the same object.
        # NOTE(review): presumably _clean_ref_map modifies ref_map in place,
        # which is why a deep copy is kept here - confirm against its body.
        "Ref Map With Tags": copy.deepcopy(ref_map),
        "Ref Map": _clean_ref_map(paper_root=root, ref_map=ref_map),
    }

    # Derive the per-kind views from the cleaned reference map.
    citations, tables, figures = _split_citations_tables_figs(paper_dict["Ref Map"])
    paper_dict["Citations"] = citations
    paper_dict["Tables"] = tables
    paper_dict["Figures"] = figures

    if verbose:
        print(
            (
                "Finished generating Paper object for "
                f"PMCID = {paper_dict['PMCID']}..."
            )
        )

    return paper_dict



[docs]
def define_data_dict() -> dict:
    """
    Returns a static definition of each of the elements
    returned in a Paper dictionary.

    The page-number entries are listed under "First Page" / "Last Page", the
    keys used by the generated paper dictionary. The older "FPage" / "LPage"
    keys are kept with the same descriptions so existing lookups keep working.

    :return: A dictionary where keys are the elements in the Paper dictionary,
        and values are descriptions of those elements.
    :rtype: dict
    """
    first_page_doc = "First page on which this paper was published in its journal."
    last_page_doc = "Last page on which this paper was published in its journal."

    data_dict = {
        "PMCID": "PMCID of the PMC article. Unique.",
        "Title": "Title of the PMC article.",
        "Authors": cleanerdoc(
            """Dataframe of the PMC Authors, including first
            names, last names, email addresses, and affiliations
            if provided."""
        ),
        "Non-Author Contributors": cleanerdoc(
            """Dataframe of the non-author contributors,
            including first names, last names, email addresses,
            and affiliations if provided."""
        ),
        "Abstract": cleanerdoc(
            """List of TextSections parsed from the abstract
            portion of the XML. Use Paper.print_abstract() or
            Paper.abstract_as_str() for a simple view of the text."""
        ),
        "Body": cleanerdoc(
            """List of TextSections parsed from the body portion of the XML.
            Use Paper.print_body() or Paper.body_as_str() for a simple view
            of the text."""
        ),
        "Journal ID": cleanerdoc(
            """Dict of ID Type, ID pairs for the Journal in which the article
            was published. ie. NLM-TA and ISO-ABBREV IDs."""
        ),
        "Journal Title": cleanerdoc("""Name of the journal in text."""),
        "ISSN": "Dict of ISSN type, ISSN number values for the article.",
        "Publisher Name": "Name of the publisher in text.",
        "Publisher Location": "Location of the publisher in text.",
        "Article ID": cleanerdoc(
            """Dict of ID Type, ID Value pairs. ie. p.article_id['pmc'] gives
            the PMCID for the article."""
        ),
        "Article Types": "List of 'header' article types for the article.",
        "Article Categories": "List of 'non-header' article types for the article.",
        "Published Date": cleanerdoc(
            """Dict of various publishing dates of the paper
            (ie: electronic pub, print pub)."""
        ),
        "Volume": cleanerdoc(
            """The Volume # in which this paper was
            published in its journal(s)."""
        ),
        "Issue": cleanerdoc(
            """The Issue # in which this paper was grouped within the
            volume of the journal(s) in which it is published."""
        ),
        # Keys matching the generated paper dictionary.
        "First Page": first_page_doc,
        "Last Page": last_page_doc,
        # Legacy aliases, retained for backward compatibility.
        "FPage": first_page_doc,
        "LPage": last_page_doc,
        "Permissions": cleanerdoc(
            """Summary of copyright statement, license type,
            and full license text for the paper."""
        ),
        "Copyright Statement": cleanerdoc(
            """Returns the Copyright statement. Usually a short phrase
            identifying the individuals who have copyrighted this research,
            under a copyright license type found via paper.license."""
        ),
        "License": cleanerdoc(
            """Returns the License Type the research is licensed
             under (ie: Open Access)."""
        ),
        "Funding": cleanerdoc(
            """Returns a list of groups which funded the research.
              Important for bias detection."""
        ),
        "Footnote": "Text of any footnote statement provided with the article.",
        "Acknowledgements": cleanerdoc(
            """List of acknowledgement statements provided with
            the article."""
        ),
        "Notes": "List of notes included with the article.",
        "Custom Meta": cleanerdoc(
            """Dict of custom metadata key, value pairs
            provided with the article."""
        ),
        "Ref Map": cleanerdoc(
            """Dict of Index, Reference value pairs. Use p.ref_map to decode
            data references within TextSection.text_with_refs text. ie. When
            working with the full text with references, you may come across
            something like [MHTML::dataref::0]. This means that the reference
            under p.ref_map[0] was extracted from this location in the text.
            This can be useful for linking text with tables, figures, and
            xrefs for more detailed analysis."""
        ),
    }

    return data_dict




[docs]
def gather_title(root: ET.Element) -> str:
    """
    Extract the title of a PMC paper from its XML root.

    The title element under ``<article-meta>`` is preferred. If there is
    none, any ``<article-title>`` in the document is used instead. The
    returned string is the full text of the first matching element,
    including text nested inside inline markup such as ``<italic>``.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: The title of the PMC paper, or None if no title element exists.
    :rtype: str
    """
    # A bare `//article-title` also matches titles of cited works, so look in
    # the article metadata first and only then fall back to the whole document.
    title_nodes = root.xpath("//article-meta//article-title")
    if not title_nodes:
        title_nodes = root.xpath("//article-title")

    if not title_nodes:
        warnings.warn(
            "No article title found in the retrieved XML.", unexpectedZeroMatchWarning
        )
        return None

    if len(title_nodes) > 1:
        warnings.warn(
            (
                "Warning! Multiple titles matched. Setting "
                "Paper.title to the first match."
            ),
            unexpectedMultipleMatchWarning,
        )

    # Join every text fragment of the element: selecting `text()` nodes would
    # cut the title off at the first child element.
    return "".join(title_nodes[0].itertext())



def _get_contributor_tuples(
    root: ET.Element, contributors: List[ET.Element]
) -> List[Tuple]:
    """
    Helper function to retrieve tuples of contributor information.

    For each contributor, the affiliations are resolved by following its
    ``<xref ref-type='aff'>`` children to the ``<aff>`` elements under a
    ``<contrib-group>`` with the matching id.

    :param ET.Element root: The root of the XML tree to search.
    :param List[ET.Element] contributors: A list of lxml Element objects
        containing contributor information.

    :return: A list of tuples representing contributor information in the form
        (contrib_type, first_name, last_name, address, affiliations).
        ``contrib_type`` is None when the element has no ``contrib-type``
        attribute. Name and address fields are None when absent.
    :rtype: List[Tuple]
    """

    def _stripped(text):
        # findtext() returns None when the element is absent.
        return text.strip() if text is not None else None

    contributor_tuples = []
    for contributor in contributors:
        # The attribute may be missing (the non-author query selects every
        # contrib that is not explicitly an author), in which case get()
        # returns None and must not be dereferenced.
        contrib_type = contributor.get("contrib-type")
        if contrib_type:
            contrib_type = contrib_type.capitalize().strip()
        first_name = _stripped(contributor.findtext(".//given-names"))
        last_name = _stripped(contributor.findtext(".//surname"))
        address = _stripped(contributor.findtext(".//address/email"))

        affiliations = []
        for aff in contributor.xpath(".//xref[@ref-type='aff']"):
            aff_id = aff.get("rid")
            # Free text directly inside the <aff> element.
            aff_texts = root.xpath(
                (f"//contrib-group/aff[@id='{aff_id}']" "/text()[not(parent::label)]")
            )
            if len(aff_texts) > 1:
                warnings.warn(
                    (
                        "Multiple affiliations with the same ID found. "
                        "Check XML Formatting."
                    ),
                    unexpectedMultipleMatchWarning,
                )
            if len(aff_texts) == 0:
                aff_texts = ["Affiliation data not found."]

            institutions = root.xpath(
                (
                    f"//contrib-group/aff[@id='{aff_id}']"
                    "/institution-wrap/institution/text()"
                )
            )
            institutions = " ".join([str(inst) for inst in institutions])

            # Generate affiliation text
            if institutions:
                affiliation = f"{aff_id.strip()}: {institutions}{aff_texts[0].strip()}"
            else:
                affiliation = f"{aff_id.strip()}: {aff_texts[0].strip()}"
            affiliations.append(affiliation)

        contributor_tuples.append(
            (contrib_type, first_name, last_name, address, affiliations)
        )
    return contributor_tuples



[docs]
def gather_authors(root: ET.Element) -> pd.DataFrame:
    """
    Build a DataFrame describing the authors of a PMC article.

    Authors are the ``<contrib>`` elements whose ``contrib-type`` is
    ``'author'``.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: One row per author with the columns below, or None (after an
             ``unexpectedZeroMatchWarning``) when no author is found:
             - Contributor_Type: Type of contributor (e.g., 'author').
             - First_Name: First name of the author.
             - Last_Name: Last name of the author.
             - Email_Address: Email address of the author.
             - Affiliations: Affiliations of the author.
    :rtype: pd.DataFrame
    """
    author_nodes = root.xpath(".//contrib[@contrib-type='author']")
    if not author_nodes:
        warnings.warn(
            "Warning! Authors could not be matched", unexpectedZeroMatchWarning
        )
        return None

    column_names = [
        "Contributor_Type",
        "First_Name",
        "Last_Name",
        "Email_Address",
        "Affiliations",
    ]
    # One tuple per author, already in column order.
    rows = _get_contributor_tuples(root=root, contributors=author_nodes)
    return pd.DataFrame(rows, columns=column_names)




[docs]
def gather_non_author_contributors(root: ET.Element) -> Union[str, pd.DataFrame]:
    """
    Build a DataFrame describing the non-author contributors of a PMC article.

    Non-author contributors are the ``<contrib>`` elements whose
    ``contrib-type`` is not ``'author'``.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: A message string when no non-author contributor exists,
             otherwise a DataFrame with one row per contributor and columns:
             - Contributor_Type: Type of contributor.
             - First_Name: First name of the contributor.
             - Last_Name: Last name of the contributor.
             - Email_Address: Email address of the contributor.
             - Affiliations: Affiliations of the contributor.
    :rtype: Union[str, pd.DataFrame]
    """
    contributor_nodes = root.xpath(".//contrib[not(@contrib-type='author')]")
    if not contributor_nodes:
        return "No non-author contributors were found after parsing this paper."

    column_names = [
        "Contributor_Type",
        "First_Name",
        "Last_Name",
        "Email_Address",
        "Affiliations",
    ]
    rows = _get_contributor_tuples(root=root, contributors=contributor_nodes)
    return pd.DataFrame(rows, columns=column_names)




[docs]
def gather_abstract(
    root: ET.Element, ref_map: basicBiMap
) -> List[Union[TextSection, TextParagraph]]:
    """
    Parse the abstract of an XML document into a list of TextSections
    and/or TextParagraphs.

    Only the first ``<abstract>`` element is used. Its direct ``<sec>``
    children become TextSections and its direct ``<p>`` children become
    TextParagraphs. Any other child triggers a warning and is skipped.

    :param ET.Element root: The root element of the PMC paper XML tree.
    :param basicBiMap ref_map: A reference map used for decoding data references
        within the text; passed on to every TextSection / TextParagraph.

    :return: The parsed abstract elements in document order, or None when the
             XML has no ``<abstract>`` element.
    :rtype: List[Union[TextSection, TextParagraph]]
    """
    abstract_nodes = root.xpath("//abstract")
    if not abstract_nodes:
        warnings.warn("No abstract found.", unexpectedZeroMatchWarning)
        return None
    if len(abstract_nodes) > 1:
        warnings.warn(
            "Warning! Multiple abstracts matched. "
            "Filling in Paper.abstract with the first match.",
            unexpectedMultipleMatchWarning,
        )

    # Sections are parsed recursively by TextSection; paragraphs are flat.
    parsed = []
    for node in abstract_nodes[0].iterchildren():
        tag = node.tag
        if tag == "p":
            parsed.append(TextParagraph(p_root=node, ref_map=ref_map))
        elif tag == "sec":
            parsed.append(TextSection(sec_root=node, ref_map=ref_map))
        else:
            warnings.warn(
                f"Warning! Unexpected child with of type {tag} "
                "found under an XML <abstract> tag."
            )

    return parsed




[docs]
def gather_body(
    root: ET.Element, ref_map: basicBiMap
) -> List[Union[TextSection, TextParagraph]]:
    """
    Parse the body of an XML document into a list of TextSections
    and/or TextParagraphs.

    Only the first ``<body>`` element is used. Its direct ``<sec>`` children
    become TextSections and its direct ``<p>`` children become
    TextParagraphs. Any other child triggers a warning and is skipped.

    :param ET.Element root: The root element of the PMC paper XML tree.
    :param basicBiMap ref_map: A reference map used for decoding data references
        within the text; passed on to every TextSection / TextParagraph.

    :return: The parsed body elements in document order, or None when the
             XML has no ``<body>`` element.
    :rtype: List[Union[TextSection, TextParagraph]]
    """
    # Locate the body subtree in the XML.
    body_nodes = root.xpath("//body")
    if not body_nodes:
        warnings.warn(
            "Warning! No <body> tag found. "
            "This paper may be abstract only, or the Open Access portion "
            "may be abstract only. This also may happen with author "
            "manuscripts and other non-final editions."
        )
        return None
    if len(body_nodes) > 1:
        warnings.warn(
            "Warning! Multiple 'body's matched. Filling in Paper.body "
            "with the first match.",
            unexpectedMultipleMatchWarning,
        )

    # Sections are parsed recursively by TextSection; paragraphs are flat.
    parsed = []
    for node in body_nodes[0].iterchildren():
        tag = node.tag
        if tag == "p":
            parsed.append(TextParagraph(p_root=node, ref_map=ref_map))
        elif tag == "sec":
            parsed.append(TextSection(sec_root=node, ref_map=ref_map))
        else:
            warnings.warn(
                f"Warning! Unexpected child with of type {tag} "
                "found under an XML <body> tag."
            )

    return parsed




[docs]
def gather_journal_id(root: ET.Element) -> dict:
    """
    Collect the journal identifiers listed under ``<journal-meta>``.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: A dictionary mapping each ``journal-id-type`` attribute value to
             the text of its ``<journal-id>`` element. When a type occurs
             more than once, the last occurrence wins.
    :rtype: dict
    """
    id_dict = {}
    for node in root.xpath("//journal-meta/journal-id"):
        id_dict[node.get("journal-id-type")] = node.text
    return id_dict




[docs]
def gather_journal_title(root: ET.Element) -> Union[List[str], str]:
    """
    Read the journal title(s) from a PMC XML document.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: The title as a string when exactly one ``<journal-title>``
             exists, a list of strings when there are several, or None
             (after an ``unexpectedZeroMatchWarning``) when there is none.
    :rtype: Union[List[str], str, None]
    """
    titles = [node.text for node in root.xpath("//journal-title")]

    if not titles:
        warnings.warn("No journal title found.", unexpectedZeroMatchWarning)
        return None

    # An article might belong to multiple journals with multiple titles.
    return titles[0] if len(titles) == 1 else titles




[docs]
def gather_issn(root: ET.Element) -> dict:
    """
    Collect the ISSN values listed under ``<journal-meta>``.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: A dictionary mapping each ``pub-type`` attribute value to the
             text of its ``<issn>`` element. When a type occurs more than
             once, the last occurrence wins.
    :rtype: dict
    """
    issn_dict = {}
    for node in root.xpath("//journal-meta/issn"):
        issn_dict[node.get("pub-type")] = node.text
    return issn_dict




[docs]
def gather_publisher_name(root: ET.Element) -> Union[str, List[str]]:
    """
    Read the publisher name(s) from a PMC XML document.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: The name as a string when exactly one ``<publisher-name>``
             exists under ``<journal-meta>/<publisher>``; otherwise a list of
             all names found (empty when there is none).
    :rtype: Union[str, List[str]]
    """
    names = [
        node.text for node in root.xpath("//journal-meta/publisher/publisher-name")
    ]
    return names[0] if len(names) == 1 else names




[docs]
def gather_publisher_location(root: ET.Element) -> Union[str, List[str]]:
    """
    Read the publisher location(s) from a PMC XML document.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: The location as a string when exactly one ``<publisher-loc>``
            exists under ``<journal-meta>/<publisher>``; otherwise a list of
            all locations found (empty when there is none).
    :rtype: Union[str, List[str]]
    """
    locations = [
        node.text for node in root.xpath("//journal-meta/publisher/publisher-loc")
    ]
    return locations[0] if len(locations) == 1 else locations




[docs]
def gather_article_id(root: ET.Element) -> Dict[str, str]:
    """
    Collect the article identifiers listed under ``<article-meta>``.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: A dictionary mapping each ``pub-id-type`` attribute value to the
             text of its ``<article-id>`` element. When a type occurs more
             than once, the last occurrence wins.
    :rtype: Dict[str, str]
    """
    id_dict = {}
    for node in root.xpath("//article-meta/article-id"):
        id_dict[node.get("pub-id-type")] = node.text
    return id_dict




[docs]
def gather_article_types(root: ET.Element) -> List[str]:
    """
    Read the Article Types from a PMC XML document.

    Article Types are the ``<subject>`` entries of the ``<subj-group>``
    elements marked with the subj-group-type 'heading' inside the first
    ``<article-categories>`` list.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: A list of strings representing the Article Types found. If the
             list would be empty, an explanatory message string is returned
             instead. None is returned (after an
             ``unexpectedZeroMatchWarning``) when the XML has no
             ``<article-categories>`` list at all.
    :rtype: List[str]
    """
    category_lists = root.xpath("//article-meta/article-categories")
    if not category_lists:
        warnings.warn("No 'article-categories' list found.", unexpectedZeroMatchWarning)
        return None
    if len(category_lists) > 1:
        warnings.warn(
            "Warning! Multiple 'article-categories' lists matched. Filling in "
            "Paper.article_categories with the first match.",
            unexpectedMultipleMatchWarning,
        )

    heading_subjects = category_lists[0].xpath(
        "subj-group[@subj-group-type='heading']/subject"
    )
    article_types = [subject.text for subject in heading_subjects]
    if article_types:
        return article_types

    return (
        "No article type found (No article category with subject type "
        "'heading'). Check Paper.article_categories for other categories."
    )




[docs]
def gather_article_categories(root: ET.Element) -> List[Dict[str, str]]:
    """
    Extract Other Article Categories from a PMC XML document.

    This function retrieves article categories that are not marked as 'heading'
    in the subj-group-type attribute. Only the first ``<article-categories>``
    list is used.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: A list of single-entry dictionaries, one per ``<subject>``, with
             the subj-group-type of the enclosing ``<subj-group>`` as key
             (None when the group has no such attribute) and the subject
             text as value. If the list would be empty, an explanatory
             message string is returned instead. None is returned (after an
             ``unexpectedZeroMatchWarning``) when the XML has no
             ``<article-categories>`` list at all.
    :rtype: List[Dict[str, str]]
    """
    matches = root.xpath("//article-meta/article-categories")
    if len(matches) > 1:
        warnings.warn(
            (
                "Warning! Multiple 'article-categories' lists matched. Filling "
                "in Paper.article_categories with the first match."
            ),
            unexpectedMultipleMatchWarning,
        )
    elif len(matches) == 0:
        warnings.warn("No 'article-categories' list found.", unexpectedZeroMatchWarning)
        return None
    article_categories = matches[0]

    # The subj-group-type attribute is on <subj-group>, not on <subject>, so
    # read it from the group and pair it with each direct <subject> child.
    other_groups = article_categories.xpath(
        "subj-group[not(@subj-group-type='heading')]"
    )
    other_cats = []
    for group in other_groups:
        group_type = group.get("subj-group-type")
        for subject in group.findall("subject"):
            other_cats.append({group_type: subject.text})

    if not other_cats:
        other_cats = (
            "No extra article categories found. "
            "Check paper.article_types for header categories."
        )
    return other_cats




[docs]
def gather_published_date(root: ET.Element) -> Dict[str, datetime]:
    """
    Read the publishing dates from a PMC XML document.

    Every ``<pub-date>`` under ``<article-meta>`` is converted to a datetime
    built from its ``<year>``, ``<month>`` and ``<day>`` children. A missing
    month or day defaults to 1. A missing year also defaults to 1, and emits
    an ``unexpectedZeroMatchWarning``.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: A dictionary mapping each ``pub-type`` attribute value to the
             corresponding datetime. When a type occurs more than once, the
             last occurrence wins.
    :rtype: Dict[str, datetime]
    """
    # TODO: update for multi-publishing (need to find an example first)

    pdate_dict = {}
    for pub_date in root.xpath("//article-meta/pub-date"):
        year_text = pub_date.xpath("year/text()")
        if year_text:
            year = int(year_text[0])
        else:
            warnings.warn(
                "No year found for one of the publishing dates. Defaulting "
                "to year = 1!",
                unexpectedZeroMatchWarning,
            )
            year = 1

        # A missing month or day is taken to be the 1st.
        month_text = pub_date.xpath("month/text()")
        month = int(month_text[0]) if month_text else 1

        day_text = pub_date.xpath("day/text()")
        day = int(day_text[0]) if day_text else 1

        pdate_dict[pub_date.get("pub-type")] = datetime(
            year=year, month=month, day=day
        )
    return pdate_dict




[docs]
def gather_volume(root: ET.Element) -> str:
    """
    Read the Volume # of the parent publication from a PMC XML document.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: The first ``<volume>`` text found under ``<article-meta>``, or
             None (after an ``unexpectedZeroMatchWarning``) if there is none.
    :rtype: str or None
    """
    # TODO: update for multi-publishing (need to find an example first)

    found = root.xpath("//article-meta/volume/text()")
    if found:
        return found[0]

    warnings.warn("No Volume # found for Publication.", unexpectedZeroMatchWarning)
    return None




[docs]
def gather_issue(root: ET.Element) -> str:
    """
    Look up the issue number of the parent publication in a PMC XML tree.

    The first ``<issue>`` text node found under ``<article-meta>`` is used.
    When there is none, an :class:`unexpectedZeroMatchWarning` is issued.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: The issue number as a string, or None when the document has no
             issue entry.
    :rtype: str or None
    """
    # TODO: update for multi-publishing (need to find an example first)

    issue_texts = root.xpath("//article-meta/issue/text()")
    if not issue_texts:
        warnings.warn("No Issue # found for Publication.", unexpectedZeroMatchWarning)
        return None

    return issue_texts[0]




[docs]
def gather_fpage(root: ET.Element) -> str:
    """
    Look up the first page number of the article within its parent
    publication.

    The first ``<fpage>`` text node found under ``<article-meta>`` is used.
    When there is none, an :class:`unexpectedZeroMatchWarning` is issued.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: The first page number as a string, or None when the document
             has no first-page entry.
    :rtype: str or None
    """
    # TODO: update for multi-publishing (need to find an example first)

    fpage_texts = root.xpath("//article-meta/fpage/text()")
    if fpage_texts:
        return fpage_texts[0]

    warnings.warn("No First Page # found for Publication.", unexpectedZeroMatchWarning)
    return None




[docs]
def gather_lpage(root: ET.Element) -> str:
    """
    Look up the last page number of the article within its parent
    publication.

    The first ``<lpage>`` text node found under ``<article-meta>`` is used.
    When there is none, an :class:`unexpectedZeroMatchWarning` is issued.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: The last page number as a string, or None when the document
             has no last-page entry.
    :rtype: str or None
    """
    # TODO: update for multi-publishing (need to find an example first)

    lpage_texts = root.xpath("//article-meta/lpage/text()")
    if not lpage_texts:
        warnings.warn(
            "No Last Page # found for Publication.", unexpectedZeroMatchWarning
        )
        return None

    return lpage_texts[0]




[docs]
def gather_permissions(root: ET.Element) -> Dict[str, str]:
    """
    Extract permissions information from a PMC XML document.

    This function retrieves the copyright statement, license type, and license
    text from the ``<permissions>`` element of the article metadata.

    If several copyright statements or several licenses are present, a
    :class:`unexpectedMultipleMatchWarning` is issued and the first one is
    used. If no copyright statement is present, a placeholder string is used
    instead. If no ``<license>`` element is present at all, the function warns
    and returns None (even when a copyright statement exists).

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: A dictionary containing the following keys, or None if the
             document has no ``<license>`` element:
             - "Copyright Statement": A string representing the copyright statement.
             - "License Type": The ``license-type`` attribute, or
               "Not Specified" when it is missing or empty.
             - "License Text": The ``<license-p>`` paragraphs joined by newlines.
    :rtype: Dict[str, str] or None
    """
    copyright_statement_matches = root.xpath(
        "//article-meta/permissions/copyright-statement/text()"
    )
    copyright_statement = "No copyright statement found."
    if len(copyright_statement_matches) == 0:
        warnings.warn("No copyright statement found.", unexpectedZeroMatchWarning)
    else:
        if len(copyright_statement_matches) > 1:
            warnings.warn(
                "Multiple copyright statements found. Retrieving the first statement.",
                unexpectedMultipleMatchWarning,
            )
        # Use the first statement in both the single- and multi-match case, so
        # the behavior matches what the warning above announces.
        copyright_statement = copyright_statement_matches[0]

    license_matches = root.xpath("//article-meta/permissions/license")
    if len(license_matches) == 0:
        warnings.warn("No license found.", unexpectedZeroMatchWarning)
        return None
    elif len(license_matches) > 1:
        warnings.warn(
            "Multiple licenses found. Retrieving the first statement.",
            unexpectedMultipleMatchWarning,
        )

    # Named license_root to avoid shadowing the interactive ``license`` builtin.
    license_root = license_matches[0]
    license_type = license_root.get("license-type")
    if not license_type:
        license_type = "Not Specified"

    license_paragraphs = []
    for child in license_root.iterchildren():
        if child.tag == "license-p":
            license_paragraphs.append(TextParagraph(p_root=child))
        else:
            warnings.warn(
                (
                    f"Warning! Unexpected child with of type {child.tag} found "
                    "under an XML <license> tag."
                )
            )
    license_text = "\n".join([str(par) for par in license_paragraphs])

    return {
        "Copyright Statement": copyright_statement,
        "License Type": license_type,
        "License Text": license_text,
    }




[docs]
def gather_funding(root: ET.Element) -> List[str]:
    """
    Collect the funding institutions named in a PMC XML document.

    Every ``<funding-group>`` under ``<article-meta>`` is visited, and the
    text of each ``award-group/funding-source/institution`` below it is
    gathered in document order.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: A list of funding institution names, or None if no institution
             is found.
    :rtype: List[str] or None
    """
    institutions = [
        name
        for group in root.xpath("//article-meta/funding-group")
        for name in group.xpath("award-group/funding-source/institution/text()")
    ]
    return institutions if institutions else None




[docs]
def gather_footnote(root: ET.Element) -> str:
    """
    Build a single footnote string from the back matter of a PMC XML document.

    Each ``<p>`` child of every ``//back/fn-group/fn`` element is converted
    with :class:`TextParagraph` and the results are chained with `` - `` as
    the separator. Any other child of an ``<fn>`` is skipped with a warning.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: The concatenated footnote text, or None if no footnote text is
             found.
    :rtype: str or None
    """
    footnote = ""
    for fn in root.xpath("//back/fn-group/fn"):
        for child in fn:
            if child.tag != "p":
                warnings.warn(
                    (
                        f"Unexpected child of type {child.tag} under a footnote "
                        "(<fn>) tag. Ignoring."
                    )
                )
                continue

            paragraph = str(TextParagraph(p_root=child))
            # The separator is only added once some text has been collected.
            footnote = f"{footnote} - {paragraph}" if footnote else paragraph

    return footnote if footnote else None




[docs]
def gather_acknowledgements(root: ET.Element) -> Union[List[str], str]:
    """
    Collect the acknowledgement sections of a PMC XML document.

    For every ``<ack>`` element in the tree, all of its text fragments are
    joined with single spaces and the result is stripped of surrounding
    whitespace.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: A list with one string per ``<ack>`` element. The list is empty
             when the document has no acknowledgements (this implementation
             always returns a list).
    :rtype: Union[List[str], str]
    """
    acknowledgements = []
    for ack in root.xpath("//ack"):
        joined = " ".join(ack.itertext())
        acknowledgements.append(joined.strip())

    return acknowledgements




[docs]
def gather_notes(root: ET.Element) -> str:
    """
    Collect the top-level notes sections of a PMC XML document.

    Every ``<notes>`` element whose parent is not itself a ``<notes>`` element
    is rendered with :func:`stringify_note`. Nested notes are not listed
    separately here.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: A list of strings, one per top-level notes section, or an empty
             list if no notes are found.
    :rtype: List[str]
    """
    top_level_notes = []
    for note in root.xpath("//notes"):
        if note.getparent().tag != "notes":
            top_level_notes.append(stringify_note(note))

    return top_level_notes




[docs]
def stringify_note(root: ET.Element) -> str:
    """
    Recursively convert a notes section into a string.

    The direct children of ``root`` are rendered in document order:

    - ``<title>`` becomes a ``Title: ...`` line.
    - ``<p>`` contributes its full text content, including text nested in
      inline markup.
    - a child ``<notes>`` is rendered recursively and indented by four spaces.

    Children with any other tag are ignored. The result always ends with a
    newline.

    :param ET.Element root: The root element of the notes section in the PMC
                            paper XML tree.

    :return: A string representation of the notes section.
    :rtype: str
    """
    parts = []
    for child in root.iterchildren():
        if child.tag == "title":
            # itertext() yields nothing for an empty title, so this renders
            # "Title: " rather than the literal "Title: None".
            parts.append(f"Title: {''.join(child.itertext())}\n")
        elif child.tag == "p":
            # child.text is None when the paragraph starts with inline markup
            # and only covers the text before the first nested element;
            # itertext() is safe for both and keeps the nested text.
            parts.append("".join(child.itertext()))
        elif child.tag == "notes":
            parts.append("\n" + textwrap.indent(stringify_note(child), " " * 4))
    parts.append("\n")
    return "".join(parts)



# def _get_note(note_root: ET.Element) ->



[docs]
def gather_custom_metadata(root: ET.Element) -> Dict[str, str]:
    """
    Extract custom metadata key-value pairs from a PMC XML document.

    Each ``<custom-meta>`` element contributes one entry: the text of its
    ``<meta-name>`` child is the key, and the text fragments of its
    ``<meta-value>`` child joined with single spaces are the value. Entries
    without a non-empty value are skipped. When a value has no usable name, a
    random UUID string is used as the key so the value is not lost. A name
    that occurs more than once keeps the last value seen.

    :param ET.Element root: The root element of the PMC paper XML tree.

    :return: A dictionary containing custom metadata key-value pairs, or None if
             no custom metadata is found.
    :rtype: Dict[str, str] or None
    """
    custom = {}
    for custom_meta in root.xpath("//custom-meta"):
        value_node = custom_meta.find("meta-value")
        if value_node is None:
            continue
        meta_data = " ".join(value_node.itertext())
        if not meta_data:
            continue

        name_node = custom_meta.find("meta-name")
        meta_name = name_node.text if name_node is not None else None
        if meta_name is None:
            # Keys are declared as str; uuid4() alone would give a UUID object.
            meta_name = str(uuid.uuid4())
        custom[meta_name] = meta_data

    return custom if custom else None



def _parse_citation(
    citation_root: ET.Element,
) -> Union[Dict[str, Union[List[str], str]], str]:
    """
    Parse citation information from a citation XML element.

    Authors are looked up in the element-citation layout
    (``person-group[@person-group-type="author"]/name``). When none are found,
    the function falls back to the first text node of a ``<mixed-citation>``
    inside this citation and returns it as a raw string. Otherwise a
    dictionary with the following keys is returned:

    - Authors: List of author names ("given-names surname").
    - Title: Title of the cited work.
    - Source: Source of the cited work (e.g., journal name).
    - Year: Year of publication.
    - Volume: Volume number (if applicable).
    - FirstPage: First page number (if applicable).
    - LastPage: Last page number (if applicable).
    - DOI: DOI (Digital Object Identifier) of the cited work (if available).
    - PMID: PubMed Identifier of the cited work (if available).

    Fields that cannot be found are None, since many references are
    incomplete. If neither authors nor mixed-citation text exist, an
    :class:`unexpectedZeroMatchWarning` is issued and the dictionary is
    returned with an empty author list.

    :param ET.Element citation_root: The root element of the citation in the XML.

    :return: A dictionary containing parsed citation information or the raw
        citation text.
    :rtype: Union[Dict[str, Union[List[str], str]], str]
    """

    root = citation_root

    # Find authors in common element-citation format
    author_matches = root.xpath('.//person-group[@person-group-type="author"]/name')

    if len(author_matches) == 0:
        # Fall back to the mixed-citation format. The path must be relative
        # (".//"): an absolute "//" path is evaluated against the whole
        # document and would return the first mixed-citation of the paper
        # for every reference.
        mixed_citation = root.xpath(".//mixed-citation/text()")
        if len(mixed_citation) > 0:
            return str(mixed_citation[0])

        # Still nothing usable: warn, then return whatever fields exist.
        warnings.warn(
            f"No authors found in citation {root.get('id')}", unexpectedZeroMatchWarning
        )

    # tries to retrieve all of the following info, fails silently
    # if none found since many refs incomplete
    citation_dict = {
        "Authors": [
            (
                f"{_try_get_xpath_text(name, 'given-names')} "
                f"{_try_get_xpath_text(name, 'surname')}"
            )
            for name in author_matches
        ],
        "Title": _try_get_xpath_text(root, ".//article-title"),
        "Source": _try_get_xpath_text(root, ".//source"),
        "Year": _try_get_xpath_text(root, ".//year"),
        "Volume": _try_get_xpath_text(root, ".//volume"),
        "FirstPage": _try_get_xpath_text(root, ".//fpage"),
        "LastPage": _try_get_xpath_text(root, ".//lpage"),
        "DOI": _try_get_xpath_text(root, './/pub-id[@pub-id-type="doi"]'),
        "PMID": _try_get_xpath_text(root, './/pub-id[@pub-id-type="pmid"]'),
    }

    return citation_dict


def _try_get_xpath_text(root: ET.Element, xpath: str, verbose=False) -> str:
    """
    Return the text of the first element under ``root`` that matches ``xpath``.

    The lookup is best-effort: when it fails with an ``AttributeError`` (for
    example because ``root.find`` returned None), None is returned instead of
    raising.

    :param ET.Element root: The XML element from which to retrieve the text.
    :param str xpath: The path expression passed to ``root.find``.
    :param bool verbose: If True, a warning is issued when the lookup fails.

    :return: The ``text`` of the matching element, or None if the lookup fails.
    :rtype: str
    """
    try:
        return root.find(xpath).text
    except AttributeError:
        if verbose:
            warnings.warn(
                (
                    "Failed xpath text retrieval while trying to find "
                    f"{xpath}.text(). Root ID: {root.get('id')}"
                )
            )
        return None


def _find_key_of_xpath(ref_map: basicBiMap, xpath_query: str) -> int:
    """
    Return the first key of ``ref_map`` whose value matches an XPath query.

    Each value of the map is parsed as an XML string and ``xpath_query`` is
    evaluated against the parsed element. The first key (in the map's
    iteration order) for which the query yields at least one result is
    returned.

    :param basicBiMap ref_map: A map whose values are XML strings.
    :param str xpath_query: The XPath query to evaluate against each parsed value.

    :return: The first key whose value matches the query, or None if no value
        matches.
    :rtype: int or None
    """
    # NOTE(review): the map is only read below, so this deep copy looks
    # unnecessary -- confirm against basicBiMap before removing it.
    snapshot = copy.deepcopy(ref_map)

    return next(
        (
            key
            for key, xml_text in snapshot.items()
            if len(ET.fromstring(xml_text).xpath(xpath_query)) > 0
        ),
        None,
    )


def _clean_ref_map(paper_root: ET.Element, ref_map: basicBiMap) -> basicBiMap:
    """
    Resolve the raw XML entries of a reference map into parsed objects.

    Every value of ``ref_map`` is parsed as an XML string and handled by tag:

    - ``<xref ref-type="bibr">``: the ``<ref>`` whose ``id`` equals the xref's
      ``rid`` is looked up in ``paper_root`` and parsed with
      :func:`_parse_citation`.
    - ``<xref ref-type="table">``: the matching ``<table-wrap>`` is wrapped in
      a :class:`TextTable`.
    - ``<xref ref-type="fig">``: the matching ``<fig>`` is wrapped in a
      :class:`TextFigure`.
    - ``<table-wrap>`` / ``<fig>`` stored directly in the map are wrapped the
      same way.
    - Any other tag is kept as serialized XML (``ET.tostring``) after a
      warning.

    An xref with a missing ``rid``, without a matching target, or with an
    unknown or absent ``ref-type`` triggers a warning and its key is left out
    of the result. If several targets match, the first one is used.

    :param ET.Element paper_root: The root element of the paper's XML.
    :param basicBiMap ref_map: A map whose values are XML strings representing
        different types of references.

    :return: A plain ``dict`` mapping the original keys to the resolved
        references.
    :rtype: dict
    """
    cleaned_ref_map = {}

    for key, item in ref_map.items():
        root = ET.fromstring(item)

        # -------XREFS LINK TO ACTUAL ITEMS OR FILL WITH BIBR--------------
        # process xrefs to citations, tables, and figures
        if root.tag == "xref":
            ref_type = root.get("ref-type")

            if ref_type == "bibr":
                ref_id = root.get("rid")
                if not ref_id:
                    warnings.warn(
                        (
                            "Citation without a reference id specified "
                            f"(Citation {root.text})!"
                        ),
                        unmatchedCitationWarning,
                    )
                    continue

                # XPath expression to find the <ref>
                # element based on the reference ID
                matching_citation_expr = f"//ref[@id='{ref_id}']"
                matches = paper_root.xpath(matching_citation_expr)
                if len(matches) == 0:
                    warnings.warn(
                        (
                            "Citation without matching reference "
                            f"(Citation {root.text})!"
                        ),
                        unmatchedCitationWarning,
                    )
                    continue
                elif len(matches) > 1:
                    warnings.warn(
                        (
                            "Multiple references found for a single citation. "
                            "Filling in with the first match."
                        )
                    )

                reference_xml = matches[0]
                cleaned_reference = _parse_citation(reference_xml)
                cleaned_ref_map[key] = cleaned_reference

            elif ref_type == "table":
                table_id = root.get("rid")
                if not table_id:
                    # The message keeps the line break and indentation of the
                    # original triple-quoted literal.
                    warnings.warn(
                        "Table ref without reference ID, no table will\n"
                        "                        be matched!",
                        unmatchedTableWarning,
                    )
                    continue

                table_xpath = f"//table-wrap[@id='{table_id}']"
                matches = paper_root.xpath(table_xpath)
                if len(matches) == 0:
                    warnings.warn(
                        (f"Table xref with rid={table_id} not " "matched in the XML!"),
                        unmatchedTableWarning,
                    )
                    continue
                elif len(matches) > 1:
                    warnings.warn(
                        (
                            "Multiple references found for a single table. "
                            "Filling in with the first match."
                        )
                    )
                table_root = matches[0]
                cleaned_ref_map[key] = TextTable(table_root=table_root)

            elif ref_type == "fig":
                fig_id = root.get("rid")
                if not fig_id:
                    warnings.warn(
                        (
                            "Figure ref unmatched. Figure ref without matching "
                            f"figure (Figure {root.text})!"
                        ),
                        unmatchedFigureWarning,
                    )
                    continue

                fig_xpath = f"//fig[@id='{fig_id}']"
                matches = paper_root.xpath(fig_xpath)
                if len(matches) == 0:
                    warnings.warn(
                        (f"Figure xref with rid={fig_id} not matched " "in the XML!"),
                        unmatchedFigureWarning,
                    )
                    continue
                elif len(matches) > 1:
                    warnings.warn(
                        (
                            "Multiple references found for a single figure. "
                            "Filling in with the first match."
                        )
                    )
                fig_root = matches[0]
                cleaned_ref_map[key] = TextFigure(fig_root=fig_root)

            elif ref_type:
                # Report the actual "ref-type" attribute value; looking up
                # "ref_type" (underscore) always yielded None in the message.
                warnings.warn(
                    (
                        f"Unknown reference type: {ref_type} "
                        "found in ref_map."
                    )
                )
            else:
                warnings.warn(
                    (
                        "<xref> in ref_map with no ref-type specified. "
                        f"Ignoring. ({root.text})"
                    )
                )

        # process tables that are directly in the ref map
        elif root.tag == "table-wrap":
            cleaned_ref_map[key] = TextTable(table_root=root)
        # process figures that are directly in the ref map
        elif root.tag == "fig":
            cleaned_ref_map[key] = TextFigure(fig_root=root)
        else:
            warnings.warn(
                (
                    f"Unexpected tag of type {root.tag} found in ref map. "
                    "Leaving as is instead of cleaning."
                )
            )
            cleaned_ref_map[key] = ET.tostring(root)

    # Final pass to set up links now that everything should be filled in.
    # NOTE(review): nothing above stores an int in cleaned_ref_map, so this
    # loop appears to be a no-op for now -- confirm before removing. Only
    # existing keys are reassigned, so the dict size is stable while iterating.
    for key, item in cleaned_ref_map.items():
        if isinstance(item, int):
            link_index = item
            cleaned_ref_map[key] = cleaned_ref_map[link_index]

    return cleaned_ref_map


def _get_ref_type(value):
    """
    Classify a reference-map value as a citation, table, or figure.

    The decision is based on the Python type of ``value``:

    - a dictionary with a "Caption" key is a figure, and a dictionary with an
      "Authors" key is a citation (any other dictionary is unclassified);
    - a string is treated as a citation (raw mixed-citation text);
    - a :class:`TextFigure` is a figure;
    - a :class:`TextTable` is a table.

    :param value: The value representing a reference in the reference map.

    :return: ``"fig"``, ``"citation"`` or ``"table"``, or None if the type
        cannot be determined.
    :rtype: str or None
    """
    if isinstance(value, dict):
        if "Caption" in value:
            return "fig"
        if "Authors" in value:
            return "citation"
        return None

    if isinstance(value, str):
        # if string, probably a citation scraped via
        # the mixed citation element parsing
        return "citation"
    if isinstance(value, TextFigure):
        return "fig"
    if isinstance(value, TextTable):
        return "table"

    return None


def _get_unique_tables(table_list: List[pd.DataFrame]) -> List[dict]:
    """
    Placeholder for table de-duplication.

    TODO: Given a set of tables (pd.DataFrame or Stylers), return a unique set.
    Until that is implemented, the input list is handed back unchanged.

    :param table_list: The tables to de-duplicate.

    :return: The same ``table_list`` object that was passed in.
    """
    return table_list


def _get_unique_citations(citation_list: List[dict]) -> List[dict]:
    """
    Placeholder for citation de-duplication.

    TODO: Given a set of citations, return a unique set
    based on citation['PMID'].
    Until that is implemented, the input list is handed back unchanged.

    :param citation_list: The citations to de-duplicate.

    :return: The same ``citation_list`` object that was passed in.
    """
    return citation_list


def _get_unique_figures(fig_list: List[dict]) -> List[dict]:
    """
    Placeholder for figure de-duplication.

    TODO: Given a set of figures, return a unique set of figures
    based on fig['Label'].
    Until that is implemented, the input list is handed back unchanged.

    :param fig_list: The figures to de-duplicate.

    :return: The same ``fig_list`` object that was passed in.
    """
    return fig_list


def _split_citations_tables_figs(
    ref_map: basicBiMap,
) -> Tuple[
    Set[Union[Dict[str, Union[List[str], str]], str]],
    Set[pd.DataFrame],
    Set[Dict[str, str]],
]:
    """
    Sort the entries of a reference map into citations, tables, and figures.

    Each value is classified with :func:`_get_ref_type`. Citations are kept
    as they are, tables contribute their ``df`` attribute, and figures
    contribute their ``fig_dict`` attribute. Values that cannot be classified
    are skipped with a warning. Each collection is finally passed through the
    matching ``_get_unique_*`` helper.

    :param ref_map: The reference map to be split into citations, tables, and figures.
    :type ref_map: basicBiMap

    :return: A tuple of three collections (built here as lists), in this order:
        - citations, each taken unchanged from the map;
        - tables, each the ``df`` of a table reference;
        - figures, each the ``fig_dict`` of a figure reference.
    :rtype: tuple
    """
    citations, tables, figures = [], [], []

    for index, ref in ref_map.items():
        kind = _get_ref_type(ref)
        if kind == "citation":
            citations.append(ref)
        elif kind == "table":
            tables.append(ref.df)
        elif kind == "fig":
            # NOTE(review): _get_ref_type also reports "fig" for plain dicts
            # with a "Caption" key, which have no fig_dict attribute --
            # confirm such values never reach this point.
            figures.append(ref.fig_dict)
        else:
            warnings.warn(
                f"Issue finding Reference type for index {index} in reference map."
            )

    return (
        _get_unique_citations(citations),
        _get_unique_tables(tables),
        _get_unique_figures(figures),
    )


# ----------------END GENERATE PAPER DICTIONARY GIVEN PMCID----------------