# Sort PDFBox versions

# stackoverflow.com

import requests
from bs4 import BeautifulSoup
from packaging.version import parse as parseVersion

# Download the PDFBox listing page from the Apache archive. An explicit
# timeout is passed because requests otherwise waits indefinitely on a
# stalled connection.
r = requests.get("https://archive.apache.org/dist/pdfbox/", timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# Keep anchors whose href starts with "2.", strip the trailing slash, and
# skip any entry whose href contains "RC".
versions = [
    link["href"].rstrip("/")
    for link in soup.select('a[href^="2."]')
    if "RC" not in link["href"]
]

# Sort by parsed version rather than as plain strings, so that multi-digit
# components compare numerically (e.g. 2.0.9 before 2.0.10).
versions.sort(key=parseVersion)

versions
import pandas as pd

# Put the version strings into a single-column table.
df = pd.DataFrame(versions, columns=["version"])

# Split each dotted version string into integer major/minor/patch columns.
version_parts = df["version"].str.split(".", expand=True)
df[["major", "minor", "patch"]] = version_parts.astype(int)

# Order rows numerically by component; the last row then holds the highest
# version.
df = df.sort_values(by=["major", "minor", "patch"])

latest = df["version"].iloc[-1]

# Download location of the pdfbox-app jar for that version.
url = (
    "https://archive.apache.org/dist/pdfbox/"
    f"{latest}/pdfbox-app-{latest}.jar"
)

url