# -*- encoding: utf-8 -*-
"""Helpers for extracting flight arrival details from an HTML arrivals table."""
from dataclasses import dataclass
from enum import Enum
import re
import typing as T

from lxml import etree as et

TIME_RE = re.compile(r"\d\d?:\d\d")
AIRPORT_RE = re.compile(r"[\w\d\s\S]+")
STATUS_RE = re.compile(r"(Arrivato|In Arrivo|Schedulato|Cancellato)")
PARSER = et.HTMLParser()


def _text_of(node: et._Element) -> T.Text:
    """Return the leading text of *node*, or "" when the element has none.

    lxml sets ``.text`` to ``None`` for elements without leading text (for
    example an empty ``<h5/>``); normalising here keeps the parsers below
    from crashing on such cells.
    """
    return node.text or ""


def not_empty(obj: et._Element) -> bool:
    """Tell whether a table row carries the expected number of ``<h5>`` cells.

    Returns:
        True when the row contains exactly 5 or 6 ``<h5>`` descendants.

    Raises:
        RuntimeError: if *obj* is not an lxml element.
    """
    # isinstance (rather than an exact type check) also accepts element subclasses.
    if not isinstance(obj, et._Element):
        raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")
    return len(obj.xpath(".//h5")) in (5, 6)


def find_table(html_content: T.Text) -> T.List[et._Element]:
    """Find the rows of the table that holds the data in the html response.

    Returns:
        The ``<tr>`` elements of the ``tbody`` whose class contains
        ``table-data`` that pass :func:`not_empty`; an empty list when no
        such ``tbody`` exists.

    Raises:
        ValueError: if more than one matching ``tbody`` is found.
    """
    root = et.fromstring(html_content, parser=PARSER)
    tbody = root.xpath("//tbody[contains(@class, 'table-data')]")
    if not tbody:
        return []
    if len(tbody) != 1:
        raise ValueError(f"Unexpected parsing result: found {len(tbody)} results")
    return [row for row in tbody[0].xpath(".//tr") if not_empty(row)]


class Status(Enum):
    """Flight status, keyed by the label matched in the page text."""

    ARRIVED = "Arrivato"
    ARRIVING = "In Arrivo"
    SCHEDULED = "Schedulato"
    CANCELED = "Cancellato"
    UNKNOWN = "Sconosciuto"

    @classmethod
    def from_str(cls, text: T.Optional[T.Text]) -> "Status":
        """Map free text to a status by substring match.

        Returns ``UNKNOWN`` when *text* is empty/None or contains none of the
        known labels. Candidates are tested in declaration order.
        """
        if not text:
            return cls.UNKNOWN
        for candidate in (cls.ARRIVED, cls.ARRIVING, cls.SCHEDULED, cls.CANCELED):
            if candidate.value in text:
                return candidate
        return cls.UNKNOWN


@dataclass
class Details(object):
    """Details parsed from one table row; every field is optional."""

    # Time taken from a struck-through cell (see maybe_parse_hour).
    th_arrival: T.Optional[T.Text] = None
    # Time taken from a cell that is not struck through.
    real_arrival: T.Optional[T.Text] = None
    code: T.Optional[T.Text] = None
    origin: T.Optional[T.Text] = None
    status: Status = Status.UNKNOWN

    def maybe_parse_hour(self, h5: et._Element) -> None:
        """Fill the arrival-hour fields if the cell holds exactly one time.

        A cell styled with ``text-decoration: line-through`` sets
        ``th_arrival``; any other cell sets ``real_arrival``.
        """
        hour = TIME_RE.findall(_text_of(h5))
        if len(hour) != 1:
            return
        if "text-decoration: line-through" in h5.attrib.get("style", ""):
            self.th_arrival = hour[0]
        else:
            self.real_arrival = hour[0]

    def maybe_parse_code(self, h5: et._Element) -> None:
        """Fill the flight code if the cell is non-empty and has the
        ``flight-numb`` class.
        """
        code = _text_of(h5).strip("\t\n ")
        if code and "flight-numb" in h5.attrib.get("class", ""):
            self.code = code

    def maybe_parse_airport(self, h5: et._Element) -> None:
        """Fill the origin airport if the cell is non-empty and has the
        ``blue-title`` class.
        """
        # NOTE: only tabs and newlines are stripped here (spaces are kept),
        # unlike maybe_parse_code; preserved to keep the output unchanged.
        airport = _text_of(h5).strip("\t\n")
        if airport and "blue-title" in h5.attrib.get("class", ""):
            self.origin = airport

    def maybe_parse_status(self, h5: et._Element) -> None:
        """Fill the status field if the cell matches some heuristics.

        When the cell class contains ``arrivato`` or ``schedulato`` the whole
        text is mapped through :meth:`Status.from_str`; otherwise the status
        is set only if exactly one known label is found in the text.
        """
        text = _text_of(h5)
        _class = h5.attrib.get("class", "")
        if "arrivato" in _class or "schedulato" in _class:
            self.status = Status.from_str(text)
            return
        parsed = STATUS_RE.findall(text)
        if len(parsed) == 1:
            self.status = Status.from_str(parsed[0])

    def __str__(self) -> T.Text:
        """Render as ``Detail<k=v,...>``, omitting unset theoric time and code."""
        res: T.Dict[T.Text, T.Optional[T.Text]] = {}
        if self.th_arrival:
            res["theoric"] = self.th_arrival
        res["real"] = self.real_arrival
        if self.code:
            res["code"] = self.code
        res["origin"] = self.origin
        res["status"] = self.status.value
        desc = ",".join(f"{k}={v}" for k, v in res.items())
        return f"Detail<{desc}>"


def get_details(table_entry: et._Element, debug: bool = False) -> Details:
    """Parse the ``<h5>`` cells of a table row into a :class:`Details`.

    A row with 5 cells is read as hour, code, airport, status; a row with
    6 cells has two hour cells first (a struck-through time is stored in
    ``th_arrival``, the other in ``real_arrival``). Rows with fewer than
    5 cells yield a ``Details`` with all defaults.

    Args:
        table_entry: the row element to inspect.
        debug: when true, print the text and attributes of every cell.

    Raises:
        ValueError: if the row contains more than 6 ``<h5>`` elements.
    """
    res = table_entry.xpath(".//h5")
    if len(res) > 6:
        raise ValueError(f"Unexpected number of h5 found in line: {len(res)}")

    if debug:
        for r in res:
            txt = _text_of(r).strip("\t\n ")
            print(f"[DEBUG] text={txt} attrs={r.attrib}")

    d = Details()
    if len(res) == 5:
        d.maybe_parse_hour(res[0])
        d.maybe_parse_code(res[1])
        d.maybe_parse_airport(res[2])
        d.maybe_parse_status(res[3])
    elif len(res) == 6:
        d.maybe_parse_hour(res[0])
        d.maybe_parse_hour(res[1])
        d.maybe_parse_code(res[2])
        d.maybe_parse_airport(res[3])
        d.maybe_parse_status(res[4])
    return d