Source code for ietfbib2bibtex.sources

#!/usr/bin/env python3

# Copyright (C) 2022-23 Freie Universität Berlin
# Copyright (C) 2023-24 TU Dresden
# Copyright (C) 2023-24 HAW Hamburg
#
# This file is subject to the terms and conditions of the GNU Lesser
# General Public License v2.1. See the file LICENSE in the top level
# directory for more detail

"""Bibliography sources"""

import abc
import glob
import logging
import os
import re
import subprocess

import requests
import lxml.etree
import pybtex.database

from . import config

__author__ = "Martine S. Lenders"
__copyright__ = "Copyright 2022 Freie Universität Berlin"
__license__ = "LGPL v2.1"
__email__ = "m.lenders@fu-berlin.de"


class Source(abc.ABC):
    """Abstract interface that every bibliography source implements.

    Concrete sources in this module expose where their data comes from via
    :attr:`remote` and produce ``(key, entry)`` pairs from
    :meth:`iterate_entries`.
    """

    @property
    @abc.abstractmethod
    def remote(self):
        """The remote resource of the bibliography source."""
        raise NotImplementedError  # pragma: no cover

    @abc.abstractmethod
    def iterate_entries(self):
        """Iterate over all valid entries of the bibliography source."""
        raise NotImplementedError  # pragma: no cover
class RFCIndexSource(Source):
    """rfc-index.xml source."""

    # XML namespace (Clark notation) shared by every element of rfc-index.xml.
    _NS = "{https://www.rfc-editor.org/rfc-index}"

    def __init__(self, rfc_index_config: config.RFCIndexSource):
        """Store the configuration the remote URL is read from."""
        self._config = rfc_index_config

    @property
    def remote(self):
        """The URL of the rfc-index.xml, as given by the configuration."""
        return self._config.remote

    def iterate_entries(self):
        """Download the RFC index and yield one BibTeX entry per RFC.

        Yields:
            Tuples ``(key, entry)`` where ``key`` is the document ID with
            leading zeros removed and a dash inserted (``RFC0791`` becomes
            ``RFC-791``) and ``entry`` is a ``techreport``
            :class:`pybtex.database.Entry`.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        ns = self._NS
        response = requests.get(self.remote, timeout=5)
        # Fail loudly on 4xx/5xx instead of feeding an error page to the
        # XML parser.
        response.raise_for_status()
        root = lxml.etree.fromstring(response.content)
        for element in root.iter(f"{ns}rfc-entry"):
            doc_id_element = element.find(f"{ns}doc-id")
            doc_id = None if doc_id_element is None else doc_id_element.text
            if doc_id is None or not re.match(r"RFC\d+", doc_id):
                # erroneous tagging (missing, empty, or non-RFC doc-id)
                continue
            title = element.find(f"{ns}title").text
            key = re.sub(r"(RFC)0*([1-9][0-9]*)", r"\1-\2", doc_id)
            # Look the shared sub-elements up once instead of once per field.
            date = element.find(f"{ns}date")
            doi = element.find(f"{ns}doi").text
            yield key, pybtex.database.Entry(
                "techreport",
                {
                    "title": f"{{{title}}}",
                    "institution": "IETF",
                    "type": "RFC",
                    "number": re.sub(r"RFC0*([1-9][0-9]*)", r"\1", doc_id),
                    "month": date.find(f"{ns}month").text,
                    "year": date.find(f"{ns}year").text,
                    "doi": doi,
                    "url": f"https://doi.org/{doi}",
                },
                persons={
                    "author": [
                        pybtex.database.Person(author.find(f"{ns}name").text)
                        for author in element.findall(f"{ns}author")
                    ],
                },
            )
class BibXMLIDsSource(Source):
    """rsync://rsync.ietf.org/bibxml-ids/ source."""

    def __init__(self, bibxml_ids_source_config: config.BibXMLIDsSource):
        """Keep the configuration that provides ``remote`` and ``local``."""
        self._config = bibxml_ids_source_config

    @property
    def remote(self):
        """The remote location passed to rsync as the source."""
        return self._config.remote

    @property
    def local(self):
        """The directory for the bibliography source."""
        return self._config.local

    def iterate_entries(self):
        """Mirror the remote via rsync and yield entries for the XML files.

        Files matching ``*[0-9].xml`` in :attr:`local` are visited in sorted
        filename order. Every successfully converted file is yielded under
        its full ``seriesInfo`` value. In addition, whenever the name without
        the trailing ``-NN`` suffix changes (and once more at the very end),
        the last entry seen for the previous name is yielded under that
        unversioned name -- presumably the most recent revision, given the
        sort order.

        Files that fail to parse as XML, or whose author names pybtex
        rejects, are logged and skipped.
        """
        subprocess.check_call(["rsync", "-avcizxL", self.remote, self.local])
        pending_key = None
        pending_entry = None
        pattern = os.path.join(self.local, "*[0-9].xml")
        for path in sorted(glob.iglob(pattern)):
            # NOTE(review): Python documents "xmlcharrefreplace" as an
            # encode-only error handler -- confirm it behaves as intended
            # when reading.
            with open(
                path, encoding="utf-8", errors="xmlcharrefreplace"
            ) as handle:
                try:
                    tree = lxml.etree.parse(handle)
                except lxml.etree.XMLSyntaxError as exc:
                    logging.error("%s, ignoring %s", exc, path)
                    continue
            root = tree.getroot()
            front = root.find("front")
            series_info = root.find("seriesInfo")
            versioned_key = series_info.get("value")
            # Split e.g. "<name>-07" into the revision "07" and "<name>".
            number = re.sub(r".*-(\d{2})$", r"\1", versioned_key)
            unversioned = re.sub(r"(.*)-\d{2}$", r"\1", versioned_key)
            try:
                title = front.find("title").text
                series_name = series_info.get("name")
                suffix = (
                    " -- work in progress"
                    if series_name == "Internet-Draft"
                    else ""
                )
                date = front.find("date")
                data = {
                    "title": f"{{{title}}}",
                    "institution": "IETF",
                    "type": series_name + suffix,
                    "number": number,
                    "month": date.get("month"),
                    "year": date.get("year"),
                }
                target = root.get("target")
                if target:
                    data["url"] = target
                authors = [
                    pybtex.database.Person(author.get("fullname"))
                    for author in front.findall("author")
                ]
                entry = pybtex.database.Entry(
                    "techreport", data, persons={"author": authors}
                )
            except pybtex.database.InvalidNameString as exc:
                logging.error(
                    "%s in author fullname, ignoring %s", exc, path
                )
                continue
            # A new unversioned name means the previous group is complete.
            if pending_key != unversioned and pending_entry is not None:
                yield pending_key, pending_entry
            yield versioned_key, entry
            pending_key, pending_entry = unversioned, entry
        # Flush the final group, if any file was converted at all.
        if pending_key is not None and pending_entry is not None:
            yield pending_key, pending_entry