Fix parsing
This commit is contained in:
parent
fa0023d2d1
commit
31f2636dd8
|
@ -20,8 +20,8 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
def not_empty(obj: et._Element) -> bool:
|
||||
if type(obj) is et._Element:
|
||||
children = len(obj.xpath(".//h5"))
|
||||
return children == 5 or children == 6
|
||||
children = len(obj.xpath(".//td"))
|
||||
return children in (5, 6)
|
||||
|
||||
raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")
|
||||
|
||||
|
@ -32,14 +32,18 @@ def find_table(html_content: T.Text) -> T.List[et._ElementTree]:
|
|||
Find the table that holds the data in the html response
|
||||
"""
|
||||
root = et.fromstring(html_content, parser=PARSER)
|
||||
tbody = root.xpath("//tbody[contains(@class, 'table-data')]")
|
||||
tbody = root.xpath("//tbody")
|
||||
if not tbody:
|
||||
return []
|
||||
|
||||
if len(tbody) != 1:
|
||||
raise ValueError(f"Unexpected parsing result: found {len(tbody)} results")
|
||||
|
||||
result = [child for child in tbody[0].xpath(".//tr") if not_empty(child)]
|
||||
result = [
|
||||
child
|
||||
for child in tbody[0].xpath(".//tr[contains(@data-qa-id, 'row')]")
|
||||
if not_empty(child)
|
||||
]
|
||||
|
||||
return result
|
||||
|
||||
|
@ -74,56 +78,67 @@ class Details(object):
|
|||
status: Status = Status.UNKNOWN
|
||||
fr24_landing_time: T.Optional[T.Text] = None
|
||||
|
||||
def maybe_parse_hour_th(self, h5: et._ElementTree) -> None:
|
||||
def __init__(self, row: et._ElementTree) -> None:
|
||||
self.row = row
|
||||
|
||||
def maybe_parse_hour_th(self) -> None:
|
||||
"""
|
||||
This function fills the fields related to the theoretical arrival hour,
|
||||
if the input matches some heuristics.
|
||||
"""
|
||||
hour = TIME_RE.findall(h5.text)
|
||||
if len(hour) == 1:
|
||||
self.th_arrival = hour[0]
|
||||
if "text-decoration: line-through" in h5.attrib.get("style", ""):
|
||||
self.real_arrival = None
|
||||
hour = self.row.xpath(".//span[contains(@class, 'date-estimated__time')]")
|
||||
if len(hour) != 1:
|
||||
logger.debug("Cannot parse estimated time")
|
||||
return
|
||||
self.th_arrival = hour[0].text
|
||||
|
||||
def maybe_parse_hour_real(self, h5: et._ElementTree) -> None:
|
||||
def maybe_parse_hour_real(self) -> None:
|
||||
"""
|
||||
This function fills the fields related to the actual arrival hour,
|
||||
if the input matches some heuristics.
|
||||
"""
|
||||
hour = TIME_RE.findall(h5.text)
|
||||
if len(hour) == 1:
|
||||
self.real_arrival = hour[0]
|
||||
hour = self.row.xpath(".//span[contains(@class, 'date-actual__time')]")
|
||||
if len(hour) != 1:
|
||||
logger.debug("Cannot parse actual time")
|
||||
return
|
||||
self.real_arrival = hour[0].text
|
||||
|
||||
def maybe_parse_code(self, h5: et._ElementTree) -> None:
|
||||
def maybe_parse_code(self) -> None:
|
||||
"""
|
||||
This function fills the fields related to the flight code,
|
||||
if present and the input matches some heuristics.
|
||||
"""
|
||||
if "flight-numb" not in h5.attrib.get("class", ""):
|
||||
code = self.row.xpath(".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong")
|
||||
if len(code) != 1:
|
||||
logger.debug("Cannot parse code")
|
||||
return
|
||||
child = h5.xpath(".//strong")
|
||||
if len(child) == 1:
|
||||
self.code = child[0].text.strip("\t\n ").replace(" ", "")
|
||||
self.code = code[0].text.strip("\t\n ").replace(" ", "")
|
||||
|
||||
def maybe_parse_airport(self, h5: et._ElementTree) -> None:
|
||||
def maybe_parse_airport(self) -> None:
|
||||
"""
|
||||
This function fills the field for the airport, if the input matches some
|
||||
heuristics.
|
||||
"""
|
||||
airport = h5.text.strip("\t\n")
|
||||
if len(airport) > 0 and "blue-title" in h5.attrib.get("class", ""):
|
||||
self.origin = airport
|
||||
airport = self.row.xpath(".//td[contains(@class, 'lfr-flight-departure-column')]/h5")
|
||||
if len(airport) != 1:
|
||||
logger.debug("Cannot parse airport")
|
||||
return
|
||||
self.origin = airport[0].text.strip("\t\n")
|
||||
|
||||
def maybe_parse_status(self, h5: et._ElementTree) -> None:
|
||||
def maybe_parse_status(self) -> None:
|
||||
"""
|
||||
This function fills the field for the status, if the input matches some
|
||||
heuristics.
|
||||
"""
|
||||
_class = h5.attrib.get("class", "")
|
||||
status = self.row.xpath(".//td[contains(@class, 'lfr-flight-status-column')]/h5")
|
||||
if len(status) != 1:
|
||||
logger.debug("Cannot parse status")
|
||||
return
|
||||
_class = status[0].attrib.get("class", "")
|
||||
if "arrivato" in _class or "schedulato" in _class:
|
||||
self.status = Status.from_str(h5.text)
|
||||
self.status = Status.from_str(status[0].text)
|
||||
else:
|
||||
parsed = STATUS_RE.findall(h5.text)
|
||||
parsed = STATUS_RE.findall(status[0].text)
|
||||
if len(parsed) == 1:
|
||||
self.status = Status.from_str(parsed[0])
|
||||
|
||||
|
@ -133,6 +148,7 @@ class Details(object):
|
|||
only FlightRadar24 data).
|
||||
"""
|
||||
if not self.code:
|
||||
logger.debug("Cannot add aux data: missing code")
|
||||
return
|
||||
|
||||
self.fr24_landing_time = aux_data.get(self.code)
|
||||
|
@ -156,34 +172,18 @@ class Details(object):
|
|||
def get_details(
|
||||
table_entry: et._ElementTree,
|
||||
aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None,
|
||||
debug: bool = False,
|
||||
) -> Details:
|
||||
"""
|
||||
Find the dates in a table row. If a strikethrough time is found, it is
|
||||
returned as second element in the tuple.
|
||||
"""
|
||||
res = table_entry.xpath(".//h5")
|
||||
if len(res) > 6:
|
||||
raise ValueError(f"Unexpected number of h5 found in line: {len(res)}")
|
||||
d = Details(table_entry)
|
||||
|
||||
if debug:
|
||||
for r in res:
|
||||
txt = r.text.strip("\t\n ")
|
||||
print(f"[DEBUG] text={txt} attrs={r.attrib}")
|
||||
|
||||
d = Details()
|
||||
|
||||
if len(res) == 5:
|
||||
d.maybe_parse_hour_th(res[0])
|
||||
d.maybe_parse_code(res[1])
|
||||
d.maybe_parse_airport(res[2])
|
||||
d.maybe_parse_status(res[3])
|
||||
elif len(res) == 6:
|
||||
d.maybe_parse_hour_th(res[0])
|
||||
d.maybe_parse_hour_real(res[1])
|
||||
d.maybe_parse_code(res[2])
|
||||
d.maybe_parse_airport(res[3])
|
||||
d.maybe_parse_status(res[4])
|
||||
d.maybe_parse_hour_th()
|
||||
d.maybe_parse_hour_real()
|
||||
d.maybe_parse_code()
|
||||
d.maybe_parse_airport()
|
||||
d.maybe_parse_status()
|
||||
|
||||
if aux_data:
|
||||
d.maybe_add_aux_data(aux_data)
|
||||
|
@ -192,7 +192,7 @@ def get_details(
|
|||
|
||||
|
||||
def parse_fr24(
|
||||
data: T.Optional[T.Dict[T.Text, T.Any]]
|
||||
data: T.Optional[T.Dict[T.Text, T.Any]],
|
||||
) -> T.Optional[T.Dict[T.Text, T.Text]]:
|
||||
"""
|
||||
This function parses the given FlightRadar24 data into a dictionary.
|
||||
|
@ -206,9 +206,7 @@ def parse_fr24(
|
|||
results = {}
|
||||
for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][
|
||||
"arrivals"
|
||||
][
|
||||
"data"
|
||||
]: # noqa: E501
|
||||
]["data"]: # noqa: E501
|
||||
try:
|
||||
id_num = flight["flight"]["identification"]["number"]
|
||||
if _code := id_num.get("default"):
|
||||
|
|
Loading…
Reference in New Issue
Block a user