"""Render a static overview of the most downloaded crates on crates.io.

The script downloads the crates.io database dump (once per day, cached as
``db-dump-<date>.tar.gz`` in the working directory), determines the latest
non-yanked version of every crate, keeps the most downloaded crates and
renders the ``index.html`` Jinja2 template to ``out/index.html``.
"""
from collections import defaultdict
import csv
from dataclasses import dataclass
import datetime
from functools import total_ordering
import io
import json
from pathlib import Path
import shutil
import tarfile
from typing import Optional
import urllib.request

DUMP_URL = 'https://static.crates.io/db-dump.tar.gz'
# Number of most-downloaded crates that end up on the rendered page.
TOP_CRATE_COUNT = 360


def _pre_release_key(pre_release: str) -> list:
    """Return a sort key for a non-empty pre-release string.

    Each dot-separated identifier becomes a tuple so that, following the
    SemVer precedence rules, purely numeric identifiers compare numerically
    and sort below alphanumeric ones, which compare as ASCII text.  Because
    lists compare element-wise, a pre-release that is a strict prefix of
    another one sorts first.
    """
    key = []
    for identifier in pre_release.split('.'):
        if identifier.isascii() and identifier.isdigit():
            key.append((0, int(identifier), ''))
        else:
            key.append((1, 0, identifier))
    return key


@total_ordering
@dataclass()
class Version:
    """A parsed semantic version (``major.minor.patch[-pre_release]``).

    Build metadata (everything after ``+``) is discarded while parsing, so it
    takes part in neither equality nor ordering.  Equality is generated by
    ``dataclass``; the remaining comparisons are derived from ``__lt__`` by
    ``total_ordering``.
    """

    major: int
    minor: int
    patch: int
    pre_release: str

    def __init__(self, text):
        """Parse *text*, e.g. ``'1.2.3'`` or ``'1.2.3-rc.1+build'``.

        Raises:
            ValueError: if the core part is not three dot-separated integers.
        """
        text = text.split('+')[0]
        # Only the first hyphen separates the core version from the
        # pre-release; pre-release identifiers may contain hyphens themselves.
        core, _, self.pre_release = text.partition('-')
        self.major, self.minor, self.patch = [int(x) for x in core.split('.')]

    def __str__(self):
        suffix = f'-{self.pre_release}' if self.pre_release else ''
        return f'{self.major}.{self.minor}.{self.patch}{suffix}'

    def __lt__(self, other: 'Version') -> bool:
        """Return whether this version has lower precedence than *other*."""
        if not isinstance(other, Version):
            return NotImplemented
        mine = (self.major, self.minor, self.patch)
        theirs = (other.major, other.minor, other.patch)
        if mine != theirs:
            return mine < theirs
        # Same core version: a release outranks each of its pre-releases.
        if self.pre_release == '':
            return False
        if other.pre_release == '':
            return True
        return _pre_release_key(self.pre_release) < _pre_release_key(other.pre_release)

    @property
    def is_1_0(self):
        """Whether the major version is at least 1."""
        return self.major >= 1


@dataclass()
class Crate:
    """Per-crate data collected from the database dump."""

    name: str
    downloads: int
    # Highest non-yanked version without a pre-release part, if any.
    latest_version: Optional[Version] = None
    # Highest non-yanked version, pre-releases included, if any.
    latest_pre_release_version: Optional[Version] = None


def _fetch_dump(dump_tarball: Path) -> None:
    """Download the database dump to *dump_tarball* unless it already exists."""
    if dump_tarball.exists():
        return
    # Stream into a temporary sibling and rename afterwards, so the dump is
    # never held in memory as a whole and an interrupted download cannot
    # leave a truncated tarball that later runs would mistake for complete.
    partial = dump_tarball.with_name(dump_tarball.name + '.part')
    with urllib.request.urlopen(DUMP_URL) as response, open(partial, 'wb') as out:
        shutil.copyfileobj(response, out)
    partial.replace(dump_tarball)


def _read_csv(dump: tarfile.TarFile, item: tarfile.TarInfo) -> csv.DictReader:
    """Return a ``DictReader`` over the UTF-8 CSV archive member *item*."""
    return csv.DictReader(io.TextIOWrapper(dump.extractfile(item), 'UTF-8', newline=''))


def _record_version(crate: Crate, version: Version) -> None:
    """Update the latest-version fields of *crate* with *version*."""
    newest = crate.latest_pre_release_version
    if newest is None or newest < version:
        crate.latest_pre_release_version = version
    if version.pre_release == '':
        stable = crate.latest_version
        if stable is None or stable < version:
            crate.latest_version = version


def _load_dump(dump_tarball: Path):
    """Read crates, their versions and the dump metadata from *dump_tarball*.

    Returns:
        A ``(crates, metadata)`` tuple: *crates* maps crate id to ``Crate``
        and *metadata* is the parsed ``metadata.json`` (``None`` if the
        archive has no such member).
    """
    # A defaultdict lets the crate and version tables be read in whichever
    # order they appear in the archive.
    crates = defaultdict(lambda: Crate('', -1))
    metadata = None
    with tarfile.open(dump_tarball) as dump:
        for item in dump:
            # Match the exact base name so that other tables whose names
            # merely end with the same suffix are not parsed with the wrong
            # schema.
            base_name = item.name.rsplit('/', 1)[-1]
            if base_name == 'metadata.json':
                metadata = json.load(dump.extractfile(item))
            elif base_name == 'crates.csv':
                for row in _read_csv(dump, item):
                    crate = crates[row['id']]
                    crate.name = row['name']
                    crate.downloads = int(row['downloads'])
            elif base_name == 'versions.csv':
                for row in _read_csv(dump, item):
                    if row['yanked'] == 't':
                        continue
                    _record_version(crates[row['crate_id']], Version(row['num']))
    return crates, metadata


def _count_at_least_1_0(crates) -> int:
    """Count the crates whose latest stable version is 1.0 or above.

    Crates without any stable release (``latest_version is None``) are not
    counted.
    """
    return sum(
        1 for crate in crates
        if crate.latest_version is not None and crate.latest_version.is_1_0
    )


def _render_index(crates, metadata) -> None:
    """Render the ``index.html`` template to ``out/index.html``."""
    # Imported here so the version helpers above remain importable without
    # the templating dependency installed.
    from jinja2 import Environment, FileSystemLoader, StrictUndefined, select_autoescape

    env = Environment(
        loader=FileSystemLoader('.'),
        autoescape=select_autoescape(['html', 'xml']),
        undefined=StrictUndefined
    )
    index_template = env.get_template('index.html')
    rendered_index = index_template.render(crates=crates, metadata=metadata)

    out_file = Path('out', 'index.html')
    out_file.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding: the dump is read as UTF-8, so write UTF-8 as well
    # instead of relying on the platform default.
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(rendered_index)


def main() -> None:
    """Fetch today's dump, summarise the top crates and render the page."""
    today = datetime.date.today().strftime('%Y-%m-%d')
    dump_tarball = Path(f'db-dump-{today}.tar.gz')
    _fetch_dump(dump_tarball)

    # Raise the csv module's per-field size limit above its default;
    # presumably some fields in the dump exceed it.
    csv.field_size_limit(69696969)
    all_crates, metadata = _load_dump(dump_tarball)

    most_downloaded_crates = sorted(all_crates.values(), key=lambda x: x.downloads, reverse=True)
    crates = most_downloaded_crates[:TOP_CRATE_COUNT]
    print('{}/{} crates at or above version 1.0'.format(_count_at_least_1_0(crates), len(crates)))

    _render_index(crates, metadata)


if __name__ == '__main__':
    main()