From 31f2636dd84c705e3014776d3bf29a620346d9db Mon Sep 17 00:00:00 2001 From: Blallo Date: Sat, 17 Feb 2024 16:33:34 +0100 Subject: [PATCH] Fix parsing --- latecomers/parse.py | 104 ++++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 53 deletions(-) diff --git a/latecomers/parse.py b/latecomers/parse.py index 6020a81..2bce84b 100644 --- a/latecomers/parse.py +++ b/latecomers/parse.py @@ -20,8 +20,8 @@ logger = logging.getLogger(__name__) def not_empty(obj: et._Element) -> bool: if type(obj) is et._Element: - children = len(obj.xpath(".//h5")) - return children == 5 or children == 6 + children = len(obj.xpath(".//td")) + return children in (5, 6) raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}") @@ -32,14 +32,18 @@ def find_table(html_content: T.Text) -> T.List[et._ElementTree]: Find the table that holds the data in the html response """ root = et.fromstring(html_content, parser=PARSER) - tbody = root.xpath("//tbody[contains(@class, 'table-data')]") + tbody = root.xpath("//tbody") if not tbody: return [] if len(tbody) != 1: raise ValueError(f"Unexpected parsing result: found {len(tbody)} results") - result = [child for child in tbody[0].xpath(".//tr") if not_empty(child)] + result = [ + child + for child in tbody[0].xpath(".//tr[contains(@data-qa-id, 'row')]") + if not_empty(child) + ] return result @@ -74,56 +78,67 @@ class Details(object): status: Status = Status.UNKNOWN fr24_landing_time: T.Optional[T.Text] = None - def maybe_parse_hour_th(self, h5: et._ElementTree) -> None: + def __init__(self, row: et._ElementTree) -> None: + self.row = row + + def maybe_parse_hour_th(self) -> None: """ This function fills the fileds related to the theoric arrival hour, if the input matches some heuristics. """ - hour = TIME_RE.findall(h5.text) - if len(hour) == 1: - self.th_arrival = hour[0] - if "text-decoration: line-through" in h5.attrib.get("style", ""): - self.real_arrival = None + hour = self.row.xpath(".//span[contains(@class, 'date-estimated__time')]") + if len(hour) != 1: + logger.debug("Cannot parse estimated time") + return + self.th_arrival = hour[0].text - def maybe_parse_hour_real(self, h5: et._ElementTree) -> None: + def maybe_parse_hour_real(self) -> None: """ This function fills the fileds related to the theoric arrival hour, if the input matches some heuristics. """ - hour = TIME_RE.findall(h5.text) - if len(hour) == 1: - self.real_arrival = hour[0] + hour = self.row.xpath(".//span[contains(@class, 'date-actual__time')]") + if len(hour) != 1: + logger.debug("Cannot parse actual time") + return + self.real_arrival = hour[0].text - def maybe_parse_code(self, h5: et._ElementTree) -> None: + def maybe_parse_code(self) -> None: """ This function fills the fileds related to the flight code, if present and the input matches some heuristics. """ - if "flight-numb" not in h5.attrib.get("class", ""): + code = self.row.xpath(".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong") + if len(code) != 1: + logger.debug("Cannot parse code") return - child = h5.xpath(".//strong") - if len(child) == 1: - self.code = child[0].text.strip("\t\n ").replace(" ", "") + self.code = code[0].text.strip("\t\n ").replace(" ", "") - def maybe_parse_airport(self, h5: et._ElementTree) -> None: + def maybe_parse_airport(self) -> None: """ This function fills the field for the airport, if the input matches some heuristics. """ - airport = h5.text.strip("\t\n") - if len(airport) > 0 and "blue-title" in h5.attrib.get("class", ""): - self.origin = airport + airport = self.row.xpath(".//td[contains(@class, 'lfr-flight-departure-column')]/h5") + if len(airport) != 1: + logger.debug("Cannot parse airport") + return + self.origin = airport[0].text.strip("\t\n") - def maybe_parse_status(self, h5: et._ElementTree) -> None: + def maybe_parse_status(self) -> None: """ This function fills the filed for the status, if the input matches some heuristics. """ - _class = h5.attrib.get("class", "") + status = self.row.xpath(".//td[contains(@class, 'lfr-flight-status-column')]/h5") + if len(status) != 1: + logger.debug("Cannot parse status") + return + _class = status[0].attrib.get("class", "") if "arrivato" in _class or "schedulato" in _class: - self.status = Status.from_str(h5.text) + self.status = Status.from_str(status[0].text) else: - parsed = STATUS_RE.findall(h5.text) + parsed = STATUS_RE.findall(status[0].text) if len(parsed) == 1: self.status = Status.from_str(parsed[0]) @@ -133,6 +148,7 @@ class Details(object): only FlightRadar24 data). """ if not self.code: + logger.debug("Cannot add aux data: missing code") return self.fr24_landing_time = aux_data.get(self.code) @@ -156,34 +172,18 @@ class Details(object): def get_details( table_entry: et._ElementTree, aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None, - debug: bool = False, ) -> Details: """ Find the dates in a table row. If a strikenthrough time is found, it is returned as second element in the tuple. """ - res = table_entry.xpath(".//h5") - if len(res) > 6: - raise ValueError(f"Unexpected number of h5 found in line: {len(res)}") + d = Details(table_entry) - if debug: - for r in res: - txt = r.text.strip("\t\n ") - print(f"[DEBUG] text={txt} attrs={r.attrib}") - - d = Details() - - if len(res) == 5: - d.maybe_parse_hour_th(res[0]) - d.maybe_parse_code(res[1]) - d.maybe_parse_airport(res[2]) - d.maybe_parse_status(res[3]) - elif len(res) == 6: - d.maybe_parse_hour_th(res[0]) - d.maybe_parse_hour_real(res[1]) - d.maybe_parse_code(res[2]) - d.maybe_parse_airport(res[3]) - d.maybe_parse_status(res[4]) + d.maybe_parse_hour_th() + d.maybe_parse_hour_real() + d.maybe_parse_code() + d.maybe_parse_airport() + d.maybe_parse_status() if aux_data: d.maybe_add_aux_data(aux_data) @@ -192,7 +192,7 @@ def get_details( def parse_fr24( - data: T.Optional[T.Dict[T.Text, T.Any]] + data: T.Optional[T.Dict[T.Text, T.Any]], ) -> T.Optional[T.Dict[T.Text, T.Text]]: """ This function parses the given FlightRadar24 data into a pandas DataFrame. @@ -206,9 +206,7 @@ def parse_fr24( results = {} for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][ "arrivals" - ][ - "data" - ]: # noqa: E501 + ]["data"]: # noqa: E501 try: id_num = flight["flight"]["identification"]["number"] if _code := id_num.get("default"):