Fix parsing

master
blallo 2024-02-17 16:33:34 +01:00
parent fa0023d2d1
commit 31f2636dd8
Signed by: blallo
GPG Key ID: C530464EEDCF489A
1 changed file with 51 additions and 53 deletions

View File

@ -20,8 +20,8 @@ logger = logging.getLogger(__name__)
def not_empty(obj: et._Element) -> bool:
if type(obj) is et._Element:
children = len(obj.xpath(".//h5"))
return children == 5 or children == 6
children = len(obj.xpath(".//td"))
return children in (5, 6)
raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")
@ -32,14 +32,18 @@ def find_table(html_content: T.Text) -> T.List[et._ElementTree]:
Find the table that holds the data in the html response
"""
root = et.fromstring(html_content, parser=PARSER)
tbody = root.xpath("//tbody[contains(@class, 'table-data')]")
tbody = root.xpath("//tbody")
if not tbody:
return []
if len(tbody) != 1:
raise ValueError(f"Unexpected parsing result: found {len(tbody)} results")
result = [child for child in tbody[0].xpath(".//tr") if not_empty(child)]
result = [
child
for child in tbody[0].xpath(".//tr[contains(@data-qa-id, 'row')]")
if not_empty(child)
]
return result
@ -74,56 +78,67 @@ class Details(object):
status: Status = Status.UNKNOWN
fr24_landing_time: T.Optional[T.Text] = None
def maybe_parse_hour_th(self, h5: et._ElementTree) -> None:
def __init__(self, row: et._ElementTree) -> None:
self.row = row
def maybe_parse_hour_th(self) -> None:
"""
This function fills the fields related to the theoretical arrival hour,
if the input matches some heuristics.
"""
hour = TIME_RE.findall(h5.text)
if len(hour) == 1:
self.th_arrival = hour[0]
if "text-decoration: line-through" in h5.attrib.get("style", ""):
self.real_arrival = None
hour = self.row.xpath(".//span[contains(@class, 'date-estimated__time')]")
if len(hour) != 1:
logger.debug("Cannot parse estimated time")
return
self.th_arrival = hour[0].text
def maybe_parse_hour_real(self, h5: et._ElementTree) -> None:
def maybe_parse_hour_real(self) -> None:
"""
This function fills the fields related to the real arrival hour,
if the input matches some heuristics.
"""
hour = TIME_RE.findall(h5.text)
if len(hour) == 1:
self.real_arrival = hour[0]
hour = self.row.xpath(".//span[contains(@class, 'date-actual__time')]")
if len(hour) != 1:
logger.debug("Cannot parse actual time")
return
self.real_arrival = hour[0].text
def maybe_parse_code(self, h5: et._ElementTree) -> None:
def maybe_parse_code(self) -> None:
"""
This function fills the fields related to the flight code,
if present and the input matches some heuristics.
"""
if "flight-numb" not in h5.attrib.get("class", ""):
code = self.row.xpath(".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong")
if len(code) != 1:
logger.debug("Cannot parse code")
return
child = h5.xpath(".//strong")
if len(child) == 1:
self.code = child[0].text.strip("\t\n ").replace(" ", "")
self.code = code[0].text.strip("\t\n ").replace(" ", "")
def maybe_parse_airport(self, h5: et._ElementTree) -> None:
def maybe_parse_airport(self) -> None:
"""
This function fills the field for the airport, if the input matches some
heuristics.
"""
airport = h5.text.strip("\t\n")
if len(airport) > 0 and "blue-title" in h5.attrib.get("class", ""):
self.origin = airport
airport = self.row.xpath(".//td[contains(@class, 'lfr-flight-departure-column')]/h5")
if len(airport) != 1:
logger.debug("Cannot parse airport")
return
self.origin = airport[0].text.strip("\t\n")
def maybe_parse_status(self, h5: et._ElementTree) -> None:
def maybe_parse_status(self) -> None:
"""
This function fills the field for the status, if the input matches some
heuristics.
"""
_class = h5.attrib.get("class", "")
status = self.row.xpath(".//td[contains(@class, 'lfr-flight-status-column')]/h5")
if len(status) != 1:
logger.debug("Cannot parse status")
return
_class = status[0].attrib.get("class", "")
if "arrivato" in _class or "schedulato" in _class:
self.status = Status.from_str(h5.text)
self.status = Status.from_str(status[0].text)
else:
parsed = STATUS_RE.findall(h5.text)
parsed = STATUS_RE.findall(status[0].text)
if len(parsed) == 1:
self.status = Status.from_str(parsed[0])
@ -133,6 +148,7 @@ class Details(object):
only FlightRadar24 data).
"""
if not self.code:
logger.debug("Cannot add aux data: missing code")
return
self.fr24_landing_time = aux_data.get(self.code)
@ -156,34 +172,18 @@ class Details(object):
def get_details(
table_entry: et._ElementTree,
aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None,
debug: bool = False,
) -> Details:
"""
Find the dates in a table row. If a struck-through time is found, it is
returned as the second element in the tuple.
"""
res = table_entry.xpath(".//h5")
if len(res) > 6:
raise ValueError(f"Unexpected number of h5 found in line: {len(res)}")
d = Details(table_entry)
if debug:
for r in res:
txt = r.text.strip("\t\n ")
print(f"[DEBUG] text={txt} attrs={r.attrib}")
d = Details()
if len(res) == 5:
d.maybe_parse_hour_th(res[0])
d.maybe_parse_code(res[1])
d.maybe_parse_airport(res[2])
d.maybe_parse_status(res[3])
elif len(res) == 6:
d.maybe_parse_hour_th(res[0])
d.maybe_parse_hour_real(res[1])
d.maybe_parse_code(res[2])
d.maybe_parse_airport(res[3])
d.maybe_parse_status(res[4])
d.maybe_parse_hour_th()
d.maybe_parse_hour_real()
d.maybe_parse_code()
d.maybe_parse_airport()
d.maybe_parse_status()
if aux_data:
d.maybe_add_aux_data(aux_data)
@ -192,7 +192,7 @@ def get_details(
def parse_fr24(
data: T.Optional[T.Dict[T.Text, T.Any]]
data: T.Optional[T.Dict[T.Text, T.Any]],
) -> T.Optional[T.Dict[T.Text, T.Text]]:
"""
This function parses the given FlightRadar24 data into a pandas DataFrame.
@ -206,9 +206,7 @@ def parse_fr24(
results = {}
for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][
"arrivals"
][
"data"
]: # noqa: E501
]["data"]: # noqa: E501
try:
id_num = flight["flight"]["identification"]["number"]
if _code := id_num.get("default"):