Fix parsing

This commit is contained in:
sfigato 2024-02-17 16:33:34 +01:00
parent fa0023d2d1
commit 31f2636dd8
Signed by: blallo
GPG Key ID: C530464EEDCF489A

View File

@ -20,8 +20,8 @@ logger = logging.getLogger(__name__)
def not_empty(obj: et._Element) -> bool: def not_empty(obj: et._Element) -> bool:
if type(obj) is et._Element: if type(obj) is et._Element:
children = len(obj.xpath(".//h5")) children = len(obj.xpath(".//td"))
return children == 5 or children == 6 return children in (5, 6)
raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}") raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")
@ -32,14 +32,18 @@ def find_table(html_content: T.Text) -> T.List[et._ElementTree]:
Find the table that holds the data in the html response Find the table that holds the data in the html response
""" """
root = et.fromstring(html_content, parser=PARSER) root = et.fromstring(html_content, parser=PARSER)
tbody = root.xpath("//tbody[contains(@class, 'table-data')]") tbody = root.xpath("//tbody")
if not tbody: if not tbody:
return [] return []
if len(tbody) != 1: if len(tbody) != 1:
raise ValueError(f"Unexpected parsing result: found {len(tbody)} results") raise ValueError(f"Unexpected parsing result: found {len(tbody)} results")
result = [child for child in tbody[0].xpath(".//tr") if not_empty(child)] result = [
child
for child in tbody[0].xpath(".//tr[contains(@data-qa-id, 'row')]")
if not_empty(child)
]
return result return result
@ -74,56 +78,67 @@ class Details(object):
status: Status = Status.UNKNOWN status: Status = Status.UNKNOWN
fr24_landing_time: T.Optional[T.Text] = None fr24_landing_time: T.Optional[T.Text] = None
def maybe_parse_hour_th(self, h5: et._ElementTree) -> None: def __init__(self, row: et._ElementTree) -> None:
self.row = row
def maybe_parse_hour_th(self) -> None:
""" """
This function fills the fields related to the theoretical arrival hour, This function fills the fields related to the theoretical arrival hour,
if the input matches some heuristics. if the input matches some heuristics.
""" """
hour = TIME_RE.findall(h5.text) hour = self.row.xpath(".//span[contains(@class, 'date-estimated__time')]")
if len(hour) == 1: if len(hour) != 1:
self.th_arrival = hour[0] logger.debug("Cannot parse estimated time")
if "text-decoration: line-through" in h5.attrib.get("style", ""): return
self.real_arrival = None self.th_arrival = hour[0].text
def maybe_parse_hour_real(self, h5: et._ElementTree) -> None: def maybe_parse_hour_real(self) -> None:
""" """
This function fills the fields related to the real arrival hour, This function fills the fields related to the real arrival hour,
if the input matches some heuristics. if the input matches some heuristics.
""" """
hour = TIME_RE.findall(h5.text) hour = self.row.xpath(".//span[contains(@class, 'date-actual__time')]")
if len(hour) == 1: if len(hour) != 1:
self.real_arrival = hour[0] logger.debug("Cannot parse actual time")
return
self.real_arrival = hour[0].text
def maybe_parse_code(self, h5: et._ElementTree) -> None: def maybe_parse_code(self) -> None:
""" """
This function fills the fields related to the flight code, This function fills the fields related to the flight code,
if present and the input matches some heuristics. if present and the input matches some heuristics.
""" """
if "flight-numb" not in h5.attrib.get("class", ""): code = self.row.xpath(".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong")
if len(code) != 1:
logger.debug("Cannot parse code")
return return
child = h5.xpath(".//strong") self.code = code[0].text.strip("\t\n ").replace(" ", "")
if len(child) == 1:
self.code = child[0].text.strip("\t\n ").replace(" ", "")
def maybe_parse_airport(self, h5: et._ElementTree) -> None: def maybe_parse_airport(self) -> None:
""" """
This function fills the field for the airport, if the input matches some This function fills the field for the airport, if the input matches some
heuristics. heuristics.
""" """
airport = h5.text.strip("\t\n") airport = self.row.xpath(".//td[contains(@class, 'lfr-flight-departure-column')]/h5")
if len(airport) > 0 and "blue-title" in h5.attrib.get("class", ""): if len(airport) != 1:
self.origin = airport logger.debug("Cannot parse airport")
return
self.origin = airport[0].text.strip("\t\n")
def maybe_parse_status(self, h5: et._ElementTree) -> None: def maybe_parse_status(self) -> None:
""" """
This function fills the field for the status, if the input matches some This function fills the field for the status, if the input matches some
heuristics. heuristics.
""" """
_class = h5.attrib.get("class", "") status = self.row.xpath(".//td[contains(@class, 'lfr-flight-status-column')]/h5")
if len(status) != 1:
logger.debug("Cannot parse status")
return
_class = status[0].attrib.get("class", "")
if "arrivato" in _class or "schedulato" in _class: if "arrivato" in _class or "schedulato" in _class:
self.status = Status.from_str(h5.text) self.status = Status.from_str(status[0].text)
else: else:
parsed = STATUS_RE.findall(h5.text) parsed = STATUS_RE.findall(status[0].text)
if len(parsed) == 1: if len(parsed) == 1:
self.status = Status.from_str(parsed[0]) self.status = Status.from_str(parsed[0])
@ -133,6 +148,7 @@ class Details(object):
only FlightRadar24 data). only FlightRadar24 data).
""" """
if not self.code: if not self.code:
logger.debug("Cannot add aux data: missing code")
return return
self.fr24_landing_time = aux_data.get(self.code) self.fr24_landing_time = aux_data.get(self.code)
@ -156,34 +172,18 @@ class Details(object):
def get_details( def get_details(
table_entry: et._ElementTree, table_entry: et._ElementTree,
aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None, aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None,
debug: bool = False,
) -> Details: ) -> Details:
""" """
Find the dates in a table row. If a struck-through time is found, it is Find the dates in a table row. If a struck-through time is found, it is
returned as second element in the tuple. returned as second element in the tuple.
""" """
res = table_entry.xpath(".//h5") d = Details(table_entry)
if len(res) > 6:
raise ValueError(f"Unexpected number of h5 found in line: {len(res)}")
if debug: d.maybe_parse_hour_th()
for r in res: d.maybe_parse_hour_real()
txt = r.text.strip("\t\n ") d.maybe_parse_code()
print(f"[DEBUG] text={txt} attrs={r.attrib}") d.maybe_parse_airport()
d.maybe_parse_status()
d = Details()
if len(res) == 5:
d.maybe_parse_hour_th(res[0])
d.maybe_parse_code(res[1])
d.maybe_parse_airport(res[2])
d.maybe_parse_status(res[3])
elif len(res) == 6:
d.maybe_parse_hour_th(res[0])
d.maybe_parse_hour_real(res[1])
d.maybe_parse_code(res[2])
d.maybe_parse_airport(res[3])
d.maybe_parse_status(res[4])
if aux_data: if aux_data:
d.maybe_add_aux_data(aux_data) d.maybe_add_aux_data(aux_data)
@ -192,7 +192,7 @@ def get_details(
def parse_fr24( def parse_fr24(
data: T.Optional[T.Dict[T.Text, T.Any]] data: T.Optional[T.Dict[T.Text, T.Any]],
) -> T.Optional[T.Dict[T.Text, T.Text]]: ) -> T.Optional[T.Dict[T.Text, T.Text]]:
""" """
This function parses the given FlightRadar24 data into a pandas DataFrame. This function parses the given FlightRadar24 data into a pandas DataFrame.
@ -206,9 +206,7 @@ def parse_fr24(
results = {} results = {}
for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][ for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][
"arrivals" "arrivals"
][ ]["data"]: # noqa: E501
"data"
]: # noqa: E501
try: try:
id_num = flight["flight"]["identification"]["number"] id_num = flight["flight"]["identification"]["number"]
if _code := id_num.get("default"): if _code := id_num.get("default"):