Source code for ietfbib2bibtex.sources

#!/usr/bin/env python3

# Copyright (C) 2022-23 Freie Universität Berlin
# Copyright (C) 2023-24 TU Dresden
# Copyright (C) 2023-24 HAW Hamburg
#
# This file is subject to the terms and conditions of the GNU Lesser
# General Public License v2.1. See the file LICENSE in the top level
# directory for more detail

"""Bibliography sources"""

import abc
import glob
import logging
import os
import re
import subprocess

import requests
import lxml.etree
import pybtex.database

from . import config

__author__ = "Martine S. Lenders"
__copyright__ = "Copyright 2022 Freie Universität Berlin"
__license__ = "LGPL v2.1"
__email__ = "m.lenders@fu-berlin.de"



[docs]
class Source(abc.ABC):
    """Abstract base class for a bibliography source.

    Concrete sources must implement both the :attr:`remote` property and
    the :meth:`iterate_entries` method; the class cannot be instantiated
    otherwise.
    """

    @property
    @abc.abstractmethod
    def remote(self):
        """The remote resource of the bibliography source."""
        raise NotImplementedError()  # pragma: no cover

    @abc.abstractmethod
    def iterate_entries(self):
        """Iterate over all valid entries of the bibliography source.

        The implementations in this module are generators that yield
        ``(key, entry)`` tuples, where ``entry`` is a
        :class:`pybtex.database.Entry`.
        """
        raise NotImplementedError()  # pragma: no cover





[docs]
class RFCIndexSource(Source):
    """Bibliography source backed by an ``rfc-index.xml`` document.

    The index is downloaded from :attr:`remote` each time
    :meth:`iterate_entries` is called.
    """

    # Namespace prefix (Clark notation) shared by every element looked up in
    # the RFC index.
    _NS = "{https://www.rfc-editor.org/rfc-index}"

    def __init__(self, rfc_index_config: config.RFCIndexSource):
        """Store the configuration object that describes this source.

        :param rfc_index_config: Configuration providing the ``remote``
            attribute used by :attr:`remote`.
        """
        self._config = rfc_index_config

    @property
    def remote(self):
        """The remote resource the RFC index is fetched from."""
        return self._config.remote

    def iterate_entries(self):
        """Download the RFC index and yield one entry per valid RFC.

        ``rfc-entry`` elements whose ``doc-id`` does not start with ``RFC``
        followed by digits are skipped as erroneously tagged.

        :return: Generator of ``(key, entry)`` tuples. ``key`` is the
            ``doc-id`` with leading zeros stripped and a dash inserted
            (e.g. ``RFC0791`` becomes ``RFC-791``); ``entry`` is a
            ``techreport`` :class:`pybtex.database.Entry`.
        :raises requests.RequestException: If the download fails or the
            server answers with an HTTP error status.
        :raises lxml.etree.XMLSyntaxError: If the response body is not
            well-formed XML.
        """
        ns = self._NS
        response = requests.get(self.remote, timeout=5)
        # Surface HTTP errors directly instead of attempting to parse an
        # error page as the RFC index.
        response.raise_for_status()
        root = lxml.etree.fromstring(response.content)

        for element in root.iter(f"{ns}rfc-entry"):
            doc_id = element.find(f"{ns}doc-id").text
            if not re.match(r"RFC\d+", doc_id):
                # erroneous tagging
                continue
            title = element.find(f"{ns}title").text
            # Look up shared sub-elements once per entry.
            date = element.find(f"{ns}date")
            doi = element.find(f"{ns}doi").text
            key = re.sub(r"(RFC)0*([1-9][0-9]*)", r"\1-\2", doc_id)
            fields = {
                # Extra braces keep BibTeX from changing the title's case.
                "title": f"{{{title}}}",
                "institution": "IETF",
                "type": "RFC",
                "number": re.sub(r"RFC0*([1-9][0-9]*)", r"\1", doc_id),
                "month": date.find(f"{ns}month").text,
                "year": date.find(f"{ns}year").text,
                "doi": doi,
                "url": f"https://doi.org/{doi}",
            }
            authors = [
                pybtex.database.Person(author.find(f"{ns}name").text)
                for author in element.findall(f"{ns}author")
            ]
            yield key, pybtex.database.Entry(
                "techreport",
                fields,
                persons={"author": authors},
            )





[docs]
class BibXMLIDsSource(Source):
    """rsync://rsync.ietf.org/bibxml-ids/ source.

    The remote directory is mirrored into :attr:`local` with ``rsync`` and
    the mirrored XML files are then parsed into bibliography entries.
    """

    def __init__(self, bibxml_ids_source_config: config.BibXMLIDsSource):
        """Store the configuration object that describes this source.

        :param bibxml_ids_source_config: Configuration providing the
            ``remote`` and ``local`` attributes used by :attr:`remote` and
            :attr:`local`.
        """
        self._config = bibxml_ids_source_config

    @property
    def remote(self):
        """The remote resource passed to ``rsync`` as its source."""
        return self._config.remote

    @property
    def local(self):
        """The directory for the bibliography source."""
        return self._config.local

    def iterate_entries(self):
        """Mirror the remote directory and yield entries for its XML files.

        Only files matching ``*[0-9].xml`` in :attr:`local` are read, in
        sorted filename order. Files that are not well-formed XML, that
        lack a required element or attribute, or that contain an author
        name pybtex rejects are logged and skipped.

        Each remaining file yields ``(value, entry)``, where ``value`` is
        the ``value`` attribute of its ``seriesInfo`` element. In addition,
        whenever the name without its trailing ``-NN`` suffix changes
        between consecutive files (and once more after the last file), the
        most recent entry of the previous name is yielded again under that
        unversioned name.

        :return: Generator of ``(key, entry)`` tuples with ``entry`` being
            a ``techreport`` :class:`pybtex.database.Entry`.
        :raises subprocess.CalledProcessError: If ``rsync`` exits with a
            non-zero status.
        """
        subprocess.check_call(["rsync", "-avcizxL", self.remote, self.local])
        last_unversioned = None
        last_entry = None
        pattern = os.path.join(self.local, "*[0-9].xml")
        for xml_filename in sorted(glob.iglob(pattern)):
            try:
                # Keep the file open only while parsing, so no handle stays
                # open while the generator is suspended at a ``yield``.
                # NOTE(review): the ``open()`` documentation lists
                # "xmlcharrefreplace" as supported only when writing;
                # undecodable bytes may raise here rather than be
                # replaced -- confirm.
                with open(
                    xml_filename, encoding="utf-8", errors="xmlcharrefreplace"
                ) as xml:
                    tree = lxml.etree.parse(xml)
            except lxml.etree.XMLSyntaxError as exc:
                logging.error("%s, ignoring %s", exc, xml_filename)
                continue
            root = tree.getroot()
            front = root.find("front")
            series_info = root.find("seriesInfo")
            title = None if front is None else front.find("title")
            date = None if front is None else front.find("date")
            if (
                series_info is None
                or title is None
                or date is None
                or series_info.get("name") is None
                or series_info.get("value") is None
            ):
                # Treat structurally incomplete files like other malformed
                # ones: skip them instead of aborting the whole iteration.
                logging.error(
                    "missing required element or attribute, ignoring %s",
                    xml_filename,
                )
                continue
            series_name = series_info.get("name")
            value = series_info.get("value")
            # Split "<name>-NN" into the two-digit version and the bare name;
            # both stay equal to ``value`` if there is no such suffix.
            number = re.sub(r".*-(\d{2})$", r"\1", value)
            unversioned = re.sub(r"(.*)-\d{2}$", r"\1", value)
            data = {
                # Extra braces keep BibTeX from changing the title's case.
                "title": f"{{{title.text}}}",
                "institution": "IETF",
                "type": series_name
                + (
                    " -- work in progress"
                    if series_name == "Internet-Draft"
                    else ""
                ),
                "number": number,
                "month": date.get("month"),
                "year": date.get("year"),
            }
            if root.get("target"):
                data["url"] = root.get("target")
            try:
                entry = pybtex.database.Entry(
                    "techreport",
                    data,
                    persons={
                        "author": [
                            pybtex.database.Person(author.get("fullname"))
                            for author in front.findall("author")
                        ],
                    },
                )
            except pybtex.database.InvalidNameString as exc:
                logging.error(
                    "%s in author fullname, ignoring %s", exc, xml_filename
                )
                continue
            if last_unversioned != unversioned and last_entry is not None:
                # A new name starts: emit the final version of the previous
                # one under its unversioned key.
                yield last_unversioned, last_entry
            yield value, entry
            last_unversioned = unversioned
            last_entry = entry
        if last_unversioned is not None and last_entry is not None:
            # Emit the unversioned alias for the very last name as well.
            yield last_unversioned, last_entry