Compare commits

..

No commits in common. "master" and "v0.3.1" have entirely different histories.

11 changed files with 80 additions and 20000 deletions

View File

@ -1,10 +1,12 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
import os
from latecomers.parse import find_table, get_details from latecomers.parse import find_table, get_details
with open("./devloop/sample3.html") as f: with open("./devloop/sample.html") as f:
content = f.read() content = f.read()
flights = find_table(content) flights = find_table(content)
for f in flights: for f in flights:
print(get_details(f)) print(get_details(f, os.environ.get("DEBUG") is not None))

View File

@ -2,17 +2,14 @@
import os import os
from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24 from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
from latecomers.parse import count_pages, find_table, get_details, parse_fr24 from latecomers.parse import find_table, get_details, parse_fr24
breakpoint()
aux_data = parse_fr24(retrieve_from_fr24())
body = retrieve_from_inst() body = retrieve_from_inst()
pages = count_pages(body)
flights = find_table(body)
for page in range(2, pages + 1):
body = retrieve_from_inst(page)
flights.extend(find_table(body))
flights = find_table(body)
aux_data = parse_fr24(retrieve_from_fr24())
breakpoint()
for f in flights: for f in flights:
print(get_details(f, aux_data=aux_data)) print(get_details(f, aux_data=aux_data, debug=os.environ.get("DEBUG") is not None))

File diff suppressed because one or more lines are too long

View File

@ -1,25 +1,22 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
import os
import pandas as pd import pandas as pd
from latecomers.parse import find_table, get_details from latecomers.parse import find_table, get_details
from latecomers.serializer import to_excel from latecomers.serializer import to_excel
with open("./devloop/sample3.html") as f: with open("./devloop/sample.html") as f:
content = f.read() content = f.read()
flights = find_table(content) flights = find_table(content)
data = [] data = []
for f in flights: for f in flights:
data.append(get_details(f)) data.append(get_details(f, os.environ.get("DEBUG") is not None))
print(to_excel(data)) print(to_excel(data))
colonne = { colonne = {"th_arrival": "Arrivo teorico", "real_arrival": "Arrivo reale",
"th_arrival": "Arrivo teorico", "code": "Codice volo", "origin": "Aeroporto di partenza", "status": "Stato"}
"real_arrival": "Arrivo reale",
"code": "Codice volo",
"origin": "Aeroporto di partenza",
"status": "Stato",
}
df = pd.DataFrame(data, columns=colonne) df = pd.DataFrame(data, columns=colonne)

View File

@ -35,9 +35,7 @@ class Config(object):
content = yaml.safe_load(f) content = yaml.safe_load(f)
self.to = get_section(content, "to") self.to = get_section(content, "to")
self.cc = get_section(content, "cc")
if cc := content.get("cc"):
self.cc = cc
smtp = get_section(content, "smtp") smtp = get_section(content, "smtp")
for key in [ for key in [

View File

@ -4,7 +4,7 @@ import sys
import typing as T import typing as T
from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24 from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
from latecomers.parse import count_pages, find_table, get_details, Details, parse_fr24 from latecomers.parse import find_table, get_details, Details, parse_fr24
from latecomers.serializer import to_excel from latecomers.serializer import to_excel
from latecomers.notifier import Notifier from latecomers.notifier import Notifier
from latecomers.config import Config from latecomers.config import Config
@ -23,19 +23,11 @@ def main(config: Config):
The main cli entrypoint. The main cli entrypoint.
""" """
out = Notifier(**config.smtp) out = Notifier(**config.smtp)
body = retrieve_from_inst() body = retrieve_from_inst()
pages = count_pages(body)
logger.info(f"found all data in {pages} page(s)")
table = find_table(body) table = find_table(body)
for page in range(2, pages + 1):
body = retrieve_from_inst(page)
table.extend(find_table(body))
fr24_data = retrieve_from_fr24() fr24_data = retrieve_from_fr24()
aux_data = parse_fr24(fr24_data) aux_data = parse_fr24(fr24_data)
data: T.List[Details] = [] data: T.List[Details] = []
for row in table: for row in table:
data.append(get_details(row, aux_data)) data.append(get_details(row, aux_data))

View File

@ -1,6 +1,5 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
from contextlib import contextmanager from contextlib import contextmanager
from datetime import datetime
from email import encoders from email import encoders
from email.mime.base import MIMEBase from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart from email.mime.multipart import MIMEMultipart
@ -63,7 +62,6 @@ class Notifier(object):
body = f"Resoconto dei voli dal sito di AdR per l'aereoporto di Ciampino in data {date}" # noqa: E501 body = f"Resoconto dei voli dal sito di AdR per l'aereoporto di Ciampino in data {date}" # noqa: E501
message = MIMEMultipart() message = MIMEMultipart()
message["Date"] = datetime.now().strftime("%a, %d %b %Y %H:%M:%S %z")
message["From"] = self._from message["From"] = self._from
message["To"] = ",".join(to) message["To"] = ",".join(to)
if cc: if cc:
@ -84,10 +82,7 @@ class Notifier(object):
email = message.as_string() email = message.as_string()
rcpt = to self.send(to, email)
rcpt.extend(cc)
self.send(rcpt, email)
@retry_and_log(logger, RETRIES) @retry_and_log(logger, RETRIES)
def send_no_data(self, to: T.List[T.Text], cc: T.List[T.Text]) -> None: def send_no_data(self, to: T.List[T.Text], cc: T.List[T.Text]) -> None:
@ -100,7 +95,6 @@ il vostro tecnico preferito.
""" """
message = MIMEMultipart() message = MIMEMultipart()
message["Date"] = datetime.now().strftime("%a, %d %b %Y %H:%M:%S %z")
message["From"] = self._from message["From"] = self._from
message["To"] = ",".join(to) message["To"] = ",".join(to)
if cc: if cc:
@ -110,7 +104,4 @@ il vostro tecnico preferito.
message.attach(MIMEText(body, "plain")) message.attach(MIMEText(body, "plain"))
email = message.as_string() email = message.as_string()
rcpt = to self.send(to, email)
rcpt.extend(cc)
self.send(rcpt, email)

View File

@ -20,47 +20,26 @@ logger = logging.getLogger(__name__)
def not_empty(obj: et._Element) -> bool: def not_empty(obj: et._Element) -> bool:
if type(obj) is et._Element: if type(obj) is et._Element:
if "lfr-template" in obj.attrib.get("class"): children = len(obj.xpath(".//h5"))
return False return children == 5 or children == 6
children = len(obj.xpath(".//td"))
return children in (5, 6)
raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}") raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")
@logit(logger)
def count_pages(html_content: T.Text) -> int:
    """
    Return the number of result pages advertised by the paginator.

    The value is the number of ``li`` entries found in the pagination list
    minus two; when no such entry exists the document is reported as a
    single page.
    """
    document = et.fromstring(html_content, parser=PARSER)
    entries = document.xpath(
        "//div[contains(@data-qa-id, 'paginator')]/ul[contains(@class, 'pagination')]/li"
    )
    # NOTE(review): the two discarded entries are presumably the
    # previous/next controls of the paginator -- confirm against the markup.
    return len(entries) - 2 if entries else 1
@logit(logger) @logit(logger)
def find_table(html_content: T.Text) -> T.List[et._ElementTree]: def find_table(html_content: T.Text) -> T.List[et._ElementTree]:
""" """
Find the table that holds the data in the html response Find the table that holds the data in the html response
""" """
root = et.fromstring(html_content, parser=PARSER) root = et.fromstring(html_content, parser=PARSER)
tbody = root.xpath("//tbody") tbody = root.xpath("//tbody[contains(@class, 'table-data')]")
if not tbody: if not tbody:
return [] return []
if len(tbody) != 1: if len(tbody) != 1:
raise ValueError(f"Unexpected parsing result: found {len(tbody)} results") raise ValueError(f"Unexpected parsing result: found {len(tbody)} results")
result = [ result = [child for child in tbody[0].xpath(".//tr") if not_empty(child)]
child
for child in tbody[0].xpath(".//tr[contains(@data-qa-id, 'row')]")
if not_empty(child)
]
return result return result
@ -95,73 +74,56 @@ class Details(object):
status: Status = Status.UNKNOWN status: Status = Status.UNKNOWN
fr24_landing_time: T.Optional[T.Text] = None fr24_landing_time: T.Optional[T.Text] = None
def __init__(self, row: et._ElementTree) -> None: def maybe_parse_hour_th(self, h5: et._ElementTree) -> None:
self.row = row
def maybe_parse_hour_th(self) -> None:
""" """
This function fills the fields related to the theoretical arrival hour, This function fills the fields related to the theoretical arrival hour,
if the input matches some heuristics. if the input matches some heuristics.
""" """
hour = self.row.xpath(".//span[contains(@class, 'date-estimated__time')]") hour = TIME_RE.findall(h5.text)
if len(hour) != 1: if len(hour) == 1:
logger.debug("Cannot parse estimated time") self.th_arrival = hour[0]
return if "text-decoration: line-through" in h5.attrib.get("style", ""):
self.th_arrival = hour[0].text self.real_arrival = None
def maybe_parse_hour_real(self) -> None: def maybe_parse_hour_real(self, h5: et._ElementTree) -> None:
""" """
This function fills the fields related to the real arrival hour, This function fills the fields related to the real arrival hour,
if the input matches some heuristics. if the input matches some heuristics.
""" """
hour = self.row.xpath(".//span[contains(@class, 'date-actual__time')]") hour = TIME_RE.findall(h5.text)
if len(hour) != 1: if len(hour) == 1:
logger.debug("Cannot parse actual time") self.real_arrival = hour[0]
return
self.real_arrival = hour[0].text
def maybe_parse_code(self) -> None: def maybe_parse_code(self, h5: et._ElementTree) -> None:
""" """
This function fills the fields related to the flight code, This function fills the fields related to the flight code,
if present and the input matches some heuristics. if present and the input matches some heuristics.
""" """
code = self.row.xpath( if "flight-numb" not in h5.attrib.get("class", ""):
".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong"
)
if len(code) != 1:
logger.debug("Cannot parse code")
return return
self.code = code[0].text.strip("\t\n ").replace(" ", "") child = h5.xpath(".//strong")
if len(child) == 1:
self.code = child[0].text.strip("\t\n ").replace(" ", "")
def maybe_parse_airport(self) -> None: def maybe_parse_airport(self, h5: et._ElementTree) -> None:
""" """
This function fills the field for the airport, if the input matches some This function fills the field for the airport, if the input matches some
heuristics. heuristics.
""" """
airport = self.row.xpath( airport = h5.text.strip("\t\n")
".//td[contains(@class, 'lfr-flight-departure-column')]/h5" if len(airport) > 0 and "blue-title" in h5.attrib.get("class", ""):
) self.origin = airport
if len(airport) != 1:
logger.debug("Cannot parse airport")
return
self.origin = airport[0].text.strip("\t\n")
def maybe_parse_status(self) -> None: def maybe_parse_status(self, h5: et._ElementTree) -> None:
""" """
This function fills the field for the status, if the input matches some This function fills the field for the status, if the input matches some
heuristics. heuristics.
""" """
status = self.row.xpath( _class = h5.attrib.get("class", "")
".//td[contains(@class, 'lfr-flight-status-column')]/h5"
)
if len(status) != 1:
logger.debug("Cannot parse status")
return
_class = status[0].attrib.get("class", "")
if "arrivato" in _class or "schedulato" in _class: if "arrivato" in _class or "schedulato" in _class:
self.status = Status.from_str(status[0].text) self.status = Status.from_str(h5.text)
else: else:
parsed = STATUS_RE.findall(status[0].text) parsed = STATUS_RE.findall(h5.text)
if len(parsed) == 1: if len(parsed) == 1:
self.status = Status.from_str(parsed[0]) self.status = Status.from_str(parsed[0])
@ -171,7 +133,6 @@ class Details(object):
only FlightRadar24 data). only FlightRadar24 data).
""" """
if not self.code: if not self.code:
logger.debug("Cannot add aux data: missing code")
return return
self.fr24_landing_time = aux_data.get(self.code) self.fr24_landing_time = aux_data.get(self.code)
@ -195,18 +156,34 @@ class Details(object):
def get_details( def get_details(
table_entry: et._ElementTree, table_entry: et._ElementTree,
aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None, aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None,
debug: bool = False,
) -> Details: ) -> Details:
""" """
Find the dates in a table row. If a struck-through time is found, it is Find the dates in a table row. If a struck-through time is found, it is
returned as second element in the tuple. returned as second element in the tuple.
""" """
d = Details(table_entry) res = table_entry.xpath(".//h5")
if len(res) > 6:
raise ValueError(f"Unexpected number of h5 found in line: {len(res)}")
d.maybe_parse_hour_th() if debug:
d.maybe_parse_hour_real() for r in res:
d.maybe_parse_code() txt = r.text.strip("\t\n ")
d.maybe_parse_airport() print(f"[DEBUG] text={txt} attrs={r.attrib}")
d.maybe_parse_status()
d = Details()
if len(res) == 5:
d.maybe_parse_hour_th(res[0])
d.maybe_parse_code(res[1])
d.maybe_parse_airport(res[2])
d.maybe_parse_status(res[3])
elif len(res) == 6:
d.maybe_parse_hour_th(res[0])
d.maybe_parse_hour_real(res[1])
d.maybe_parse_code(res[2])
d.maybe_parse_airport(res[3])
d.maybe_parse_status(res[4])
if aux_data: if aux_data:
d.maybe_add_aux_data(aux_data) d.maybe_add_aux_data(aux_data)
@ -215,7 +192,7 @@ def get_details(
def parse_fr24( def parse_fr24(
data: T.Optional[T.Dict[T.Text, T.Any]], data: T.Optional[T.Dict[T.Text, T.Any]]
) -> T.Optional[T.Dict[T.Text, T.Text]]: ) -> T.Optional[T.Dict[T.Text, T.Text]]:
""" """
This function parses the given FlightRadar24 data into a pandas DataFrame. This function parses the given FlightRadar24 data into a pandas DataFrame.
@ -229,7 +206,9 @@ def parse_fr24(
results = {} results = {}
for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][ for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][
"arrivals" "arrivals"
]["data"]: # noqa: E501 ][
"data"
]: # noqa: E501
try: try:
id_num = flight["flight"]["identification"]["number"] id_num = flight["flight"]["identification"]["number"]
if _code := id_num.get("default"): if _code := id_num.get("default"):

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,6 @@
[project] [project]
name = "latecomers" name = "latecomers"
version = "0.5.0" version = "0.3.1"
description = "Retrieve and save data from ADR Ciampino airport" description = "Retrieve and save data from ADR Ciampino airport"
authors = [{name="Leonardo Barcaroli", email="blallo@autistici.org"}] authors = [{name="Leonardo Barcaroli", email="blallo@autistici.org"}]
license = {text="Public Domain"} license = {text="Public Domain"}
@ -24,7 +24,6 @@ dev = [
"ipython", "ipython",
"black", "black",
"build", "build",
"ipdb",
] ]
[build-system] [build-system]

View File

@ -1,80 +0,0 @@
# This file was autogenerated by uv v0.1.3 via the following command:
# uv pip compile --all-extras pyproject.toml -o requirements.dev.txt
asttokens==2.4.1
# via stack-data
black==24.2.0
build==1.0.3
certifi==2024.2.2
# via requests
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via black
decorator==5.1.1
# via
# ipdb
# ipython
et-xmlfile==1.1.0
# via openpyxl
executing==2.0.1
# via stack-data
idna==3.6
# via requests
ipdb==0.13.13
ipython==8.21.0
# via ipdb
jedi==0.19.1
# via ipython
lxml==5.1.0
matplotlib-inline==0.1.6
# via ipython
mypy-extensions==1.0.0
# via black
numpy==1.26.4
# via pandas
openpyxl==3.1.2
packaging==23.2
# via
# black
# build
pandas==2.2.0
parso==0.8.3
# via jedi
pathspec==0.12.1
# via black
pexpect==4.9.0
# via ipython
platformdirs==4.2.0
# via black
prompt-toolkit==3.0.43
# via ipython
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
pygments==2.17.2
# via ipython
pyproject-hooks==1.0.0
# via build
python-dateutil==2.8.2
# via pandas
pytz==2024.1
# via pandas
pyyaml==6.0.1
requests==2.31.0
six==1.16.0
# via
# asttokens
# python-dateutil
stack-data==0.6.3
# via ipython
traitlets==5.14.1
# via
# ipython
# matplotlib-inline
tzdata==2024.1
# via pandas
urllib3==2.2.0
# via requests
wcwidth==0.2.13
# via prompt-toolkit