latecomers/latecomers/parse.py

253 lines
7.5 KiB
Python
Raw Normal View History

2022-08-18 18:49:47 +02:00
# -*- encoding: utf-8 -*-
2022-08-24 12:43:32 +02:00
from dataclasses import dataclass
2022-09-07 23:44:53 +02:00
from datetime import datetime
2022-08-18 18:49:47 +02:00
from enum import Enum
2022-08-25 13:11:33 +02:00
import logging
2022-08-18 18:49:47 +02:00
import re
import typing as T
2022-08-25 13:11:33 +02:00
from latecomers.helpers import logit
2022-08-18 18:49:47 +02:00
from lxml import etree as et
TIME_RE = re.compile(r"\d\d?:\d\d")
AIRPORT_RE = re.compile(r"[\w\d\s\S]+")
STATUS_RE = re.compile(r"(Arrivato|In Arrivo|Schedulato|Cancellato)")
PARSER = et.HTMLParser()
2022-08-25 13:11:33 +02:00
logger = logging.getLogger(__name__)
2022-08-18 18:49:47 +02:00
def not_empty(obj: et._Element) -> bool:
    """
    Tell whether a table row looks like a real data row.

    A row is rejected when its ``class`` attribute contains ``lfr-template``;
    otherwise it is accepted only when it has 5 or 6 ``<td>`` descendants.

    Raises ``RuntimeError`` when ``obj`` is not exactly an ``et._Element``.
    """
    # Deliberately an exact type check (not isinstance), as in the original.
    if type(obj) is not et._Element:
        raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")
    # A row may carry no ``class`` attribute at all: default to an empty
    # string so the membership test does not fail on ``None``.
    if "lfr-template" in obj.attrib.get("class", ""):
        return False
    return len(obj.xpath(".//td")) in (5, 6)
2024-05-06 01:10:43 +02:00
@logit(logger)
def count_pages(html_content: T.Text) -> int:
    """
    Return the number of result pages advertised by the paginator.

    When the paginator holds no ``<li>`` entry a single page is assumed;
    otherwise the result is the number of entries minus two.
    """
    document = et.fromstring(html_content, parser=PARSER)
    paginator_xpath = (
        "//div[contains(@data-qa-id, 'paginator')]"
        "/ul[contains(@class, 'pagination')]/li"
    )
    entries = document.xpath(paginator_xpath)
    # NOTE(review): the two discarded entries are presumably the
    # previous/next controls -- confirm against the page markup.
    return len(entries) - 2 if entries else 1
2022-08-25 13:11:33 +02:00
@logit(logger)
def find_table(html_content: T.Text) -> T.List[et._ElementTree]:
    """
    Collect the data rows of the table found in the html response.

    Returns an empty list when the document has no ``<tbody>``; raises
    ``ValueError`` when it has more than one. Only the rows whose
    ``data-qa-id`` contains ``row`` and that pass ``not_empty`` are kept.
    """
    document = et.fromstring(html_content, parser=PARSER)
    bodies = document.xpath("//tbody")
    if not bodies:
        return []
    if len(bodies) != 1:
        raise ValueError(f"Unexpected parsing result: found {len(bodies)} results")
    (body,) = bodies
    candidate_rows = body.xpath(".//tr[contains(@data-qa-id, 'row')]")
    return list(filter(not_empty, candidate_rows))
class Status(Enum):
    """
    Flight status, valued with the label looked for in the parsed text.
    """

    ARRIVED = "Arrivato"
    ARRIVING = "In Arrivo"
    SCHEDULED = "Schedulato"
    CANCELED = "Cancellato"
    UNKNOWN = "Sconosciuto"

    @classmethod
    def from_str(cls, text: T.Text) -> "Status":
        """
        Return the first known status whose label occurs in ``text``
        (case-sensitive), or ``UNKNOWN`` when none does.
        """
        # The order matters: the first label found wins.
        known = (cls.ARRIVED, cls.ARRIVING, cls.SCHEDULED, cls.CANCELED)
        return next(
            (candidate for candidate in known if candidate.value in text),
            cls.UNKNOWN,
        )
2022-08-24 12:43:32 +02:00
@dataclass
class Details(object):
    """
    Flight details extracted from a single row of the arrivals table.

    Every ``maybe_parse_*`` method fills one field from ``self.row`` and
    leaves it untouched when the row does not hold exactly one matching
    node (or, for the textual fields, when that node has no text).
    """

    th_arrival: T.Optional[T.Text] = None
    real_arrival: T.Optional[T.Text] = None
    code: T.Optional[T.Text] = None
    origin: T.Optional[T.Text] = None
    status: Status = Status.UNKNOWN
    fr24_landing_time: T.Optional[T.Text] = None

    def __init__(self, row: et._ElementTree) -> None:
        # @dataclass does not replace an explicitly defined __init__, so the
        # fields above start from their class-level defaults.
        self.row = row

    def maybe_parse_hour_th(self) -> None:
        """
        This function fills the field related to the theoric arrival hour,
        if the input matches some heuristics.
        """
        hour = self.row.xpath(".//span[contains(@class, 'date-estimated__time')]")
        if len(hour) != 1:
            logger.debug("Cannot parse estimated time")
            return
        self.th_arrival = hour[0].text

    def maybe_parse_hour_real(self) -> None:
        """
        This function fills the field related to the real arrival hour,
        if the input matches some heuristics.
        """
        hour = self.row.xpath(".//span[contains(@class, 'date-actual__time')]")
        if len(hour) != 1:
            logger.debug("Cannot parse actual time")
            return
        self.real_arrival = hour[0].text

    def maybe_parse_code(self) -> None:
        """
        This function fills the field related to the flight code,
        if present and the input matches some heuristics.
        """
        code = self.row.xpath(
            ".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong"
        )
        if len(code) != 1:
            logger.debug("Cannot parse code")
            return
        text = code[0].text
        # lxml reports a node without leading text as None.
        if text is None:
            logger.debug("Cannot parse code")
            return
        # Surrounding whitespace is trimmed and inner spaces are removed.
        self.code = text.strip("\t\n ").replace(" ", "")

    def maybe_parse_airport(self) -> None:
        """
        This function fills the field for the airport, if the input matches
        some heuristics.
        """
        airport = self.row.xpath(
            ".//td[contains(@class, 'lfr-flight-departure-column')]/h5"
        )
        if len(airport) != 1:
            logger.debug("Cannot parse airport")
            return
        text = airport[0].text
        # lxml reports a node without leading text as None.
        if text is None:
            logger.debug("Cannot parse airport")
            return
        self.origin = text.strip("\t\n")

    def maybe_parse_status(self) -> None:
        """
        This function fills the field for the status, if the input matches
        some heuristics.
        """
        status = self.row.xpath(
            ".//td[contains(@class, 'lfr-flight-status-column')]/h5"
        )
        if len(status) != 1:
            logger.debug("Cannot parse status")
            return
        text = status[0].text
        # lxml reports a node without leading text as None.
        if text is None:
            logger.debug("Cannot parse status")
            return
        _class = status[0].attrib.get("class", "")
        if "arrivato" in _class or "schedulato" in _class:
            self.status = Status.from_str(text)
        else:
            # Without a telling class, trust the text only when it names
            # exactly one known status.
            parsed = STATUS_RE.findall(text)
            if len(parsed) == 1:
                self.status = Status.from_str(parsed[0])

    def maybe_add_aux_data(self, aux_data: T.Dict[T.Text, T.Text]) -> None:
        """
        This function extends the current data with auxiliary sources
        (currently only FlightRadar24 data), looked up by flight code.
        """
        if not self.code:
            logger.debug("Cannot add aux data: missing code")
            return
        self.fr24_landing_time = aux_data.get(self.code)

    def __str__(self) -> T.Text:
        """
        Render the details as ``Detail<key=value,...>``. Origin and status
        are always shown; the other fields only when they are set.
        """
        res: T.Dict[T.Text, T.Optional[T.Text]] = {}
        if self.th_arrival:
            res["theoric"] = self.th_arrival
        if self.real_arrival:
            res["real"] = self.real_arrival
        if self.code:
            res["code"] = self.code
        res["origin"] = self.origin
        res["status"] = self.status.value
        if self.fr24_landing_time:
            res["fr24_landing_time"] = self.fr24_landing_time
        desc = ",".join([f"{k}={v}" for k, v in res.items()])
        return f"Detail<{desc}>"
2022-09-07 23:44:53 +02:00
def get_details(
    table_entry: et._ElementTree,
    aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None,
) -> Details:
    """
    Build the Details of a table row by running every parser on it, then
    extend them with the auxiliary data when some is provided.
    """
    details = Details(table_entry)
    parsing_steps = (
        details.maybe_parse_hour_th,
        details.maybe_parse_hour_real,
        details.maybe_parse_code,
        details.maybe_parse_airport,
        details.maybe_parse_status,
    )
    for step in parsing_steps:
        step()

    # The auxiliary lookup needs the flight code, hence it runs last.
    if aux_data:
        details.maybe_add_aux_data(aux_data)
    return details
2022-09-07 23:44:53 +02:00
def parse_fr24(
    data: T.Optional[T.Dict[T.Text, T.Any]],
) -> T.Optional[T.Dict[T.Text, T.Text]]:
    """
    This function parses the given FlightRadar24 data into a dictionary
    mapping each flight code to its real arrival time, formatted as HH:MM.

    Returns None when no data is given or when the arrivals list cannot be
    reached or iterated. Flights that cannot be parsed (no code, missing or
    invalid arrival timestamp) are skipped.
    """
    logger.debug("fr24 raw data: %s", data)
    if not data:
        return None
    # ``except Exception`` keeps the best-effort behaviour while letting
    # KeyboardInterrupt and SystemExit propagate.
    try:
        results = {}
        for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][
            "arrivals"
        ]["data"]:  # noqa: E501
            try:
                id_num = flight["flight"]["identification"]["number"]
                # Prefer the default code, fall back on the alternative one.
                code = id_num.get("default") or id_num.get("alternative")
                if not code:
                    # skip if no flight code found
                    continue
                ts = flight["flight"]["time"]["real"]["arrival"]
                # fromtimestamp() without tz yields the local time of the host.
                real_arrival = datetime.fromtimestamp(ts).strftime("%H:%M")
                results[code] = real_arrival
                logger.debug("%s -> %s", code, real_arrival)
            except Exception:
                continue
        return results
    except Exception:
        return None