Compare commits

..

No commits in common. "master" and "v0.2.0" have entirely different histories.

13 changed files with 96 additions and 20045 deletions

View File

@ -1,10 +1,12 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
import os
from latecomers.parse import find_table, get_details from latecomers.parse import find_table, get_details
with open("./devloop/sample3.html") as f: with open("./devloop/sample.html") as f:
content = f.read() content = f.read()
flights = find_table(content) flights = find_table(content)
for f in flights: for f in flights:
print(get_details(f)) print(get_details(f, os.environ.get("DEBUG") is not None))

View File

@ -2,17 +2,14 @@
import os import os
from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24 from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
from latecomers.parse import count_pages, find_table, get_details, parse_fr24 from latecomers.parse import find_table, get_details, parse_fr24
breakpoint()
aux_data = parse_fr24(retrieve_from_fr24())
body = retrieve_from_inst() body = retrieve_from_inst()
pages = count_pages(body)
flights = find_table(body)
for page in range(2, pages + 1):
body = retrieve_from_inst(page)
flights.extend(find_table(body))
flights = find_table(body)
aux_data = parse_fr24(retrieve_from_fr24())
breakpoint()
for f in flights: for f in flights:
print(get_details(f, aux_data=aux_data)) print(get_details(f, aux_data=aux_data, debug=os.environ.get("DEBUG") is not None))

File diff suppressed because one or more lines are too long

View File

@ -1,25 +1,22 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
import os
import pandas as pd import pandas as pd
from latecomers.parse import find_table, get_details from latecomers.parse import find_table, get_details
from latecomers.serializer import to_excel from latecomers.serializer import to_excel
with open("./devloop/sample3.html") as f: with open("./devloop/sample.html") as f:
content = f.read() content = f.read()
flights = find_table(content) flights = find_table(content)
data = [] data = []
for f in flights: for f in flights:
data.append(get_details(f)) data.append(get_details(f, os.environ.get("DEBUG") is not None))
print(to_excel(data)) print(to_excel(data))
colonne = { colonne = {"th_arrival": "Arrivo teorico", "real_arrival": "Arrivo reale",
"th_arrival": "Arrivo teorico", "code": "Codice volo", "origin": "Aeroporto di partenza", "status": "Stato"}
"real_arrival": "Arrivo reale",
"code": "Codice volo",
"origin": "Aeroporto di partenza",
"status": "Stato",
}
df = pd.DataFrame(data, columns=colonne) df = pd.DataFrame(data, columns=colonne)

View File

@ -20,8 +20,6 @@ def get_section(obj: T.Dict[T.Text, T.Any], key: T.Text) -> T.Any:
class Config(object): class Config(object):
smtp: T.Dict[T.Text, T.Any] = {} smtp: T.Dict[T.Text, T.Any] = {}
to: T.List[T.Text] = [] to: T.List[T.Text] = []
cc: T.List[T.Text] = []
store: T.Optional[T.Text] = None
def __init__(self, path: T.Text) -> None: def __init__(self, path: T.Text) -> None:
self.from_file(path) self.from_file(path)
@ -36,9 +34,6 @@ class Config(object):
self.to = get_section(content, "to") self.to = get_section(content, "to")
if cc := content.get("cc"):
self.cc = cc
smtp = get_section(content, "smtp") smtp = get_section(content, "smtp")
for key in [ for key in [
"smtp_addr", "smtp_addr",
@ -51,7 +46,5 @@ class Config(object):
self.smtp["smtp_from"] = smtp.get("smtp_from") self.smtp["smtp_from"] = smtp.get("smtp_from")
self.store = content.get("store")
def __str__(self) -> T.Text: def __str__(self) -> T.Text:
return f"Config<smtp={self.smtp},to={self.to}>" return f"Config<smtp={self.smtp},to={self.to}>"

View File

@ -1,5 +1,4 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
from datetime import datetime, timedelta
import functools import functools
import logging import logging
import sys import sys
@ -49,10 +48,3 @@ def retry_and_log(logger: logging.Logger, retries: int):
return inner return inner
return wrapper return wrapper
def get_date() -> T.Text:
"""
Get yesterday's date in %Y-%m-%d format.
"""
return (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")

View File

@ -4,7 +4,7 @@ import sys
import typing as T import typing as T
from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24 from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
from latecomers.parse import count_pages, find_table, get_details, Details, parse_fr24 from latecomers.parse import find_table, get_details, Details, parse_fr24
from latecomers.serializer import to_excel from latecomers.serializer import to_excel
from latecomers.notifier import Notifier from latecomers.notifier import Notifier
from latecomers.config import Config from latecomers.config import Config
@ -23,27 +23,19 @@ def main(config: Config):
The main cli entrypoint. The main cli entrypoint.
""" """
out = Notifier(**config.smtp) out = Notifier(**config.smtp)
body = retrieve_from_inst() body = retrieve_from_inst()
pages = count_pages(body)
logger.info(f"found all data in {pages} page(s)")
table = find_table(body) table = find_table(body)
for page in range(2, pages + 1):
body = retrieve_from_inst(page)
table.extend(find_table(body))
fr24_data = retrieve_from_fr24() fr24_data = retrieve_from_fr24()
aux_data = parse_fr24(fr24_data) aux_data = parse_fr24(fr24_data)
data: T.List[Details] = [] data: T.List[Details] = []
for row in table: for row in table:
data.append(get_details(row, aux_data)) data.append(get_details(row, aux_data))
if not data: if not data:
out.send_no_data(config.to, config.cc) out.send_no_data(config.to)
excel = to_excel(data, config.store) excel = to_excel(data)
out.send_result(config.to, config.cc, excel) out.send_result(config.to, excel)
def cli(): def cli():

View File

@ -1,6 +1,6 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
from contextlib import contextmanager from contextlib import contextmanager
from datetime import datetime from datetime import datetime, timedelta
from email import encoders from email import encoders
from email.mime.base import MIMEBase from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart from email.mime.multipart import MIMEMultipart
@ -10,7 +10,8 @@ import smtplib
import ssl import ssl
import typing as T import typing as T
from latecomers.helpers import retry_and_log, get_date
from latecomers.helpers import retry_and_log
RETRIES = 3 RETRIES = 3
@ -56,18 +57,13 @@ class Notifier(object):
conn.sendmail(self._from, to, email) conn.sendmail(self._from, to, email)
@retry_and_log(logger, RETRIES) @retry_and_log(logger, RETRIES)
def send_result( def send_result(self, to: T.List[T.Text], result: bytes) -> None:
self, to: T.List[T.Text], cc: T.List[T.Text], result: bytes date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
) -> None:
date = get_date()
body = f"Resoconto dei voli dal sito di AdR per l'aereoporto di Ciampino in data {date}" # noqa: E501 body = f"Resoconto dei voli dal sito di AdR per l'aereoporto di Ciampino in data {date}" # noqa: E501
message = MIMEMultipart() message = MIMEMultipart()
message["Date"] = datetime.now().strftime("%a, %d %b %Y %H:%M:%S %z")
message["From"] = self._from message["From"] = self._from
message["To"] = ",".join(to) message["To"] = ",".join(to)
if cc:
message["Cc"] = ",".join(cc)
message["Subject"] = f"[{date}] Resoconto CIA da AdR" message["Subject"] = f"[{date}] Resoconto CIA da AdR"
message.attach(MIMEText(body, "plain")) message.attach(MIMEText(body, "plain"))
@ -84,14 +80,11 @@ class Notifier(object):
email = message.as_string() email = message.as_string()
rcpt = to self.send(to, email)
rcpt.extend(cc)
self.send(rcpt, email)
@retry_and_log(logger, RETRIES) @retry_and_log(logger, RETRIES)
def send_no_data(self, to: T.List[T.Text], cc: T.List[T.Text]) -> None: def send_no_data(self, to: T.List[T.Text]) -> None:
date = get_date() date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
body = f"""Attenzione body = f"""Attenzione
Nessun dato è stato trovato per i voli in data {date} dal sito di AdR per Nessun dato è stato trovato per i voli in data {date} dal sito di AdR per
@ -100,17 +93,11 @@ il vostro tecnico preferito.
""" """
message = MIMEMultipart() message = MIMEMultipart()
message["Date"] = datetime.now().strftime("%a, %d %b %Y %H:%M:%S %z")
message["From"] = self._from message["From"] = self._from
message["To"] = ",".join(to) message["To"] = ",".join(to)
if cc:
message["Cc"] = ",".join(cc)
message["Subject"] = f"ATTENZIONE: [{date}] Resoconto CIA da AdR" message["Subject"] = f"ATTENZIONE: [{date}] Resoconto CIA da AdR"
message.attach(MIMEText(body, "plain")) message.attach(MIMEText(body, "plain"))
email = message.as_string() email = message.as_string()
rcpt = to self.send(to, email)
rcpt.extend(cc)
self.send(rcpt, email)

View File

@ -20,47 +20,26 @@ logger = logging.getLogger(__name__)
def not_empty(obj: et._Element) -> bool: def not_empty(obj: et._Element) -> bool:
if type(obj) is et._Element: if type(obj) is et._Element:
if "lfr-template" in obj.attrib.get("class"): children = len(obj.xpath(".//h5"))
return False return children == 5 or children == 6
children = len(obj.xpath(".//td"))
return children in (5, 6)
raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}") raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")
@logit(logger)
def count_pages(html_content: T.Text) -> int:
"""
Count how many pages there are to be accessed
"""
root = et.fromstring(html_content, parser=PARSER)
il_items = root.xpath(
"//div[contains(@data-qa-id, 'paginator')]/ul[contains(@class, 'pagination')]/li"
)
if not il_items:
return 1
return len(il_items) - 2
@logit(logger) @logit(logger)
def find_table(html_content: T.Text) -> T.List[et._ElementTree]: def find_table(html_content: T.Text) -> T.List[et._ElementTree]:
""" """
Find the table that holds the data in the html response Find the table that holds the data in the html response
""" """
root = et.fromstring(html_content, parser=PARSER) root = et.fromstring(html_content, parser=PARSER)
tbody = root.xpath("//tbody") tbody = root.xpath("//tbody[contains(@class, 'table-data')]")
if not tbody: if not tbody:
return [] return []
if len(tbody) != 1: if len(tbody) != 1:
raise ValueError(f"Unexpected parsing result: found {len(tbody)} results") raise ValueError(f"Unexpected parsing result: found {len(tbody)} results")
result = [ result = [child for child in tbody[0].xpath(".//tr") if not_empty(child)]
child
for child in tbody[0].xpath(".//tr[contains(@data-qa-id, 'row')]")
if not_empty(child)
]
return result return result
@ -95,73 +74,56 @@ class Details(object):
status: Status = Status.UNKNOWN status: Status = Status.UNKNOWN
fr24_landing_time: T.Optional[T.Text] = None fr24_landing_time: T.Optional[T.Text] = None
def __init__(self, row: et._ElementTree) -> None: def maybe_parse_hour_th(self, h5: et._ElementTree) -> None:
self.row = row
def maybe_parse_hour_th(self) -> None:
""" """
This function fills the fileds related to the theoric arrival hour, This function fills the fileds related to the theoric arrival hour,
if the input matches some heuristics. if the input matches some heuristics.
""" """
hour = self.row.xpath(".//span[contains(@class, 'date-estimated__time')]") hour = TIME_RE.findall(h5.text)
if len(hour) != 1: if len(hour) == 1:
logger.debug("Cannot parse estimated time") self.th_arrival = hour[0]
return if "text-decoration: line-through" in h5.attrib.get("style", ""):
self.th_arrival = hour[0].text self.real_arrival = None
def maybe_parse_hour_real(self) -> None: def maybe_parse_hour_real(self, h5: et._ElementTree) -> None:
""" """
This function fills the fileds related to the theoric arrival hour, This function fills the fileds related to the theoric arrival hour,
if the input matches some heuristics. if the input matches some heuristics.
""" """
hour = self.row.xpath(".//span[contains(@class, 'date-actual__time')]") hour = TIME_RE.findall(h5.text)
if len(hour) != 1: if len(hour) == 1:
logger.debug("Cannot parse actual time") self.real_arrival = hour[0]
return
self.real_arrival = hour[0].text
def maybe_parse_code(self) -> None: def maybe_parse_code(self, h5: et._ElementTree) -> None:
""" """
This function fills the fileds related to the flight code, This function fills the fileds related to the flight code,
if present and the input matches some heuristics. if present and the input matches some heuristics.
""" """
code = self.row.xpath( if "flight-numb" not in h5.attrib.get("class", ""):
".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong"
)
if len(code) != 1:
logger.debug("Cannot parse code")
return return
self.code = code[0].text.strip("\t\n ").replace(" ", "") child = h5.xpath(".//strong")
if len(child) == 1:
self.code = child[0].text.strip("\t\n ").replace(" ", "")
def maybe_parse_airport(self) -> None: def maybe_parse_airport(self, h5: et._ElementTree) -> None:
""" """
This function fills the field for the airport, if the input matches some This function fills the field for the airport, if the input matches some
heuristics. heuristics.
""" """
airport = self.row.xpath( airport = h5.text.strip("\t\n")
".//td[contains(@class, 'lfr-flight-departure-column')]/h5" if len(airport) > 0 and "blue-title" in h5.attrib.get("class", ""):
) self.origin = airport
if len(airport) != 1:
logger.debug("Cannot parse airport")
return
self.origin = airport[0].text.strip("\t\n")
def maybe_parse_status(self) -> None: def maybe_parse_status(self, h5: et._ElementTree) -> None:
""" """
This function fills the filed for the status, if the input matches some This function fills the filed for the status, if the input matches some
heuristics. heuristics.
""" """
status = self.row.xpath( _class = h5.attrib.get("class", "")
".//td[contains(@class, 'lfr-flight-status-column')]/h5"
)
if len(status) != 1:
logger.debug("Cannot parse status")
return
_class = status[0].attrib.get("class", "")
if "arrivato" in _class or "schedulato" in _class: if "arrivato" in _class or "schedulato" in _class:
self.status = Status.from_str(status[0].text) self.status = Status.from_str(h5.text)
else: else:
parsed = STATUS_RE.findall(status[0].text) parsed = STATUS_RE.findall(h5.text)
if len(parsed) == 1: if len(parsed) == 1:
self.status = Status.from_str(parsed[0]) self.status = Status.from_str(parsed[0])
@ -171,7 +133,6 @@ class Details(object):
only FlightRadar24 data). only FlightRadar24 data).
""" """
if not self.code: if not self.code:
logger.debug("Cannot add aux data: missing code")
return return
self.fr24_landing_time = aux_data.get(self.code) self.fr24_landing_time = aux_data.get(self.code)
@ -195,18 +156,34 @@ class Details(object):
def get_details( def get_details(
table_entry: et._ElementTree, table_entry: et._ElementTree,
aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None, aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None,
debug: bool = False,
) -> Details: ) -> Details:
""" """
Find the dates in a table row. If a strikenthrough time is found, it is Find the dates in a table row. If a strikenthrough time is found, it is
returned as second element in the tuple. returned as second element in the tuple.
""" """
d = Details(table_entry) res = table_entry.xpath(".//h5")
if len(res) > 6:
raise ValueError(f"Unexpected number of h5 found in line: {len(res)}")
d.maybe_parse_hour_th() if debug:
d.maybe_parse_hour_real() for r in res:
d.maybe_parse_code() txt = r.text.strip("\t\n ")
d.maybe_parse_airport() print(f"[DEBUG] text={txt} attrs={r.attrib}")
d.maybe_parse_status()
d = Details()
if len(res) == 5:
d.maybe_parse_hour_th(res[0])
d.maybe_parse_code(res[1])
d.maybe_parse_airport(res[2])
d.maybe_parse_status(res[3])
elif len(res) == 6:
d.maybe_parse_hour_th(res[0])
d.maybe_parse_hour_real(res[1])
d.maybe_parse_code(res[2])
d.maybe_parse_airport(res[3])
d.maybe_parse_status(res[4])
if aux_data: if aux_data:
d.maybe_add_aux_data(aux_data) d.maybe_add_aux_data(aux_data)
@ -215,13 +192,11 @@ def get_details(
def parse_fr24( def parse_fr24(
data: T.Optional[T.Dict[T.Text, T.Any]], data: T.Optional[T.Dict[T.Text, T.Any]]
) -> T.Optional[T.Dict[T.Text, T.Text]]: ) -> T.Optional[T.Dict[T.Text, T.Text]]:
""" """
This function parses the given FlightRadar24 data into a pandas DataFrame. This function parses the given FlightRadar24 data into a pandas DataFrame.
""" """
logger.debug("fr24 raw data: %s", data)
if not data: if not data:
return None return None
@ -229,12 +204,14 @@ def parse_fr24(
results = {} results = {}
for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][ for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][
"arrivals" "arrivals"
]["data"]: # noqa: E501 ][
"data"
]: # noqa: E501
try: try:
id_num = flight["flight"]["identification"]["number"] id_num = flight["flight"]["identification"]["number"]
if _code := id_num.get("default"): if (_code := id_num.get("default")):
code = _code code = _code
elif _code := id_num.get("alternative"): elif (_code := id_num.get("alternative")):
code = _code code = _code
else: else:
# skip if no flight code found # skip if no flight code found
@ -242,7 +219,6 @@ def parse_fr24(
ts = flight["flight"]["time"]["real"]["arrival"] ts = flight["flight"]["time"]["real"]["arrival"]
real_arrival = datetime.fromtimestamp(ts).strftime("%H:%M") real_arrival = datetime.fromtimestamp(ts).strftime("%H:%M")
results[code] = real_arrival results[code] = real_arrival
logger.debug(f"{code} -> {real_arrival}")
except: # noqa: E722 except: # noqa: E722
continue continue

File diff suppressed because one or more lines are too long

View File

@ -1,11 +1,10 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
import logging import logging
import os.path
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
import typing as T import typing as T
from latecomers.parse import Details from latecomers.parse import Details
from latecomers.helpers import logit, get_date from latecomers.helpers import logit
import pandas as pd import pandas as pd
@ -14,7 +13,7 @@ logger = logging.getLogger(__name__)
@logit(logger) @logit(logger)
def to_excel(data: T.List[Details], dst: T.Optional[T.Text] = None) -> bytes: def to_excel(data: T.List[Details]) -> bytes:
""" """
This function takes the list of parsed rows as input and returns This function takes the list of parsed rows as input and returns
the bytes corresponding to the excel file derived from such lines. the bytes corresponding to the excel file derived from such lines.
@ -39,12 +38,4 @@ def to_excel(data: T.List[Details], dst: T.Optional[T.Text] = None) -> bytes:
tmp.seek(0) tmp.seek(0)
content = tmp.read() content = tmp.read()
if dst:
filepath = os.path.join(dst, f"{get_date()}.xlsx")
try:
with open(filepath, "wb") as out:
out.write(content)
except Exception as e:
logger.warning(f"Cannot save to path '{filepath}': {e}")
return content return content

View File

@ -1,6 +1,6 @@
[project] [project]
name = "latecomers" name = "latecomers"
version = "0.5.0" version = "0.2.0"
description = "Retrieve and save data from ADR Ciampino airport" description = "Retrieve and save data from ADR Ciampino airport"
authors = [{name="Leonardo Barcaroli", email="blallo@autistici.org"}] authors = [{name="Leonardo Barcaroli", email="blallo@autistici.org"}]
license = {text="Public Domain"} license = {text="Public Domain"}
@ -24,7 +24,6 @@ dev = [
"ipython", "ipython",
"black", "black",
"build", "build",
"ipdb",
] ]
[build-system] [build-system]

View File

@ -1,80 +0,0 @@
# This file was autogenerated by uv v0.1.3 via the following command:
# uv pip compile --all-extras pyproject.toml -o requirements.dev.txt
asttokens==2.4.1
# via stack-data
black==24.2.0
build==1.0.3
certifi==2024.2.2
# via requests
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via black
decorator==5.1.1
# via
# ipdb
# ipython
et-xmlfile==1.1.0
# via openpyxl
executing==2.0.1
# via stack-data
idna==3.6
# via requests
ipdb==0.13.13
ipython==8.21.0
# via ipdb
jedi==0.19.1
# via ipython
lxml==5.1.0
matplotlib-inline==0.1.6
# via ipython
mypy-extensions==1.0.0
# via black
numpy==1.26.4
# via pandas
openpyxl==3.1.2
packaging==23.2
# via
# black
# build
pandas==2.2.0
parso==0.8.3
# via jedi
pathspec==0.12.1
# via black
pexpect==4.9.0
# via ipython
platformdirs==4.2.0
# via black
prompt-toolkit==3.0.43
# via ipython
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
pygments==2.17.2
# via ipython
pyproject-hooks==1.0.0
# via build
python-dateutil==2.8.2
# via pandas
pytz==2024.1
# via pandas
pyyaml==6.0.1
requests==2.31.0
six==1.16.0
# via
# asttokens
# python-dateutil
stack-data==0.6.3
# via ipython
traitlets==5.14.1
# via
# ipython
# matplotlib-inline
tzdata==2024.1
# via pandas
urllib3==2.2.0
# via requests
wcwidth==0.2.13
# via prompt-toolkit