in antlir/rpm/parse_repodata.py [0:0]
def feed(self, chunk: bytes) -> Iterator[Rpm]:
    """Push compressed bytes through the parser, yielding finished `Rpm`s.

    `chunk` is run through `self.decompressor` in bounded slices, and each
    decompressed slice is handed to `self.xml_parser`.  For every parser
    event whose element tag matches `self.tag_re`, the second regex group
    selects which field of the in-progress record (`self._package`) to
    fill in.  When that group equals `self._PACKAGE`, the accumulated
    fields are yielded as an `Rpm`, the record is reset to an empty dict,
    and the element is cleared.

    Raises:
        KeyError: if a package is emitted before all of its fields were
            recorded, or an expected XML attribute is absent.
        AssertionError: if a checksum element's `pkgid` is not "YES".
        ValueError: if epoch, size or build time is not an integer.
    """
    # Upper bound on how many decompressed bytes are produced per step.
    # Keeping it small bounds the RAM used for decompression and, per the
    # original author's measurements, keeps XMLPullParser fast: it was
    # observed to get up to 50% slower on package data with larger
    # chunks.  The value was picked experimentally.
    #
    # NB: zlib appears to copy the leftover input into `unconsumed_tail`
    # rather than exposing a view of it, so a very large `chunk` gets
    # re-copied on each pass.  Callers are expected to keep incoming
    # chunks reasonably small instead of this method re-chunking them.
    max_inflated = 2 ** 14
    pending = chunk
    while pending:
        inflated = self.decompressor.decompress(
            pending, max_length=max_inflated
        )
        self.xml_parser.feed(inflated)
        # Whatever input did not fit under the cap is retried next pass.
        pending = self.decompressor.unconsumed_tail
        for _, elt in self.xml_parser.read_events():
            found = self.tag_re.match(elt.tag)
            if not found:
                continue
            tag = found.group(2)
            fields = self._package
            # Keep these branches in _KNOWN_TAGS order
            if tag == self._NAME:
                fields[self._NAME] = elt.text
            elif tag == self._VERSION:
                fields[self._VERSION] = (
                    elt.attrib["epoch"],
                    elt.attrib["ver"],
                    elt.attrib["rel"],
                )
            elif tag == self._ARCH:
                fields[self._ARCH] = elt.text
            elif tag == self._CHECKSUM:
                assert elt.attrib["pkgid"] == "YES"
                fields[self._CHECKSUM] = Checksum(
                    algorithm=elt.attrib["type"],
                    # pyre-fixme[6]: Expected `str` for 2nd param but got
                    #  `Optional[str]`.
                    hexdigest=elt.text,
                )
            elif tag == self._LOCATION:
                fields[self._LOCATION] = elt.attrib["href"]
            elif tag == self._PACKAGE:
                # The package element closes the record: everything
                # gathered so far describes this one RPM.
                epoch, version, release = fields[self._VERSION]
                yield Rpm(
                    # Keep these kwargs in _KNOWN_TAGS order
                    epoch=int(epoch),
                    name=fields[self._NAME],
                    version=version,
                    release=release,
                    arch=fields[self._ARCH],
                    checksum=fields[self._CHECKSUM],
                    location=fields[self._LOCATION],
                    size=int(fields[self._SIZE]),
                    source_rpm=fields[self._SOURCE_RPM],
                    build_timestamp=int(fields[self._TIME]),
                    # This is set after we download the RPM
                    # pyre-fixme[6]: Expected `Checksum` for 11th param but
                    #  got `None`.
                    canonical_checksum=None,
                )
                # Start from an empty record so that a field missing from
                # the next package surfaces as a KeyError instead of
                # silently reusing this package's value.
                self._package = {}
                # Dropping the element's contents uses less RAM and was
                # measured to speed up the run by 50%.
                elt.clear()
            elif tag == self._SIZE:
                fields[self._SIZE] = elt.attrib["package"]
            elif tag == self._SOURCE_RPM:
                # An empty string is normalized to None.
                fields[self._SOURCE_RPM] = elt.text or None
            elif tag == self._TIME:
                fields[self._TIME] = elt.attrib["build"]