Compare commits

..

9 Commits

Author SHA1 Message Date
blallo b9d89c3f0c
Bump to v0.5.0 2024-05-06 01:11:10 +02:00
blallo 4f67b1c97a
Iterate over pages 2024-05-06 01:10:43 +02:00
blallo 6e56f90b6b
Remove empty template line 2024-05-06 01:10:23 +02:00
blallo 5405efbbc4
Bump to v0.4.0 2024-05-05 23:29:12 +02:00
blallo 789091f7a8
Use uv machinery and add ipdb 2024-02-17 16:34:08 +01:00
blallo a8b321a47d
Improve devloop 2024-02-17 16:33:52 +01:00
blallo 31f2636dd8
Fix parsing 2024-02-17 16:33:34 +01:00
blallo fa0023d2d1
Bump to v0.3.3 2022-09-27 23:16:11 +02:00
blallo 4d1596c4a2
Make cc REALLY work 2022-09-27 23:15:50 +02:00
10 changed files with 19995 additions and 80 deletions

View File

@ -1,12 +1,10 @@
# -*- encoding: utf-8 -*-
import os
from latecomers.parse import find_table, get_details
# Dev-loop helper: parse the saved sample page and print what was extracted
# from every row. Only the current sample file and the current get_details()
# call are kept; the superseded variants from the diff are dropped.
with open("./devloop/sample3.html") as sample:
    content = sample.read()
flights = find_table(content)
# A distinct loop name avoids rebinding the (closed) file handle.
for flight in flights:
    print(get_details(flight))

View File

@ -2,14 +2,17 @@
import os
from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
from latecomers.parse import find_table, get_details, parse_fr24
from latecomers.parse import count_pages, find_table, get_details, parse_fr24

# Dev-loop helper: stop in the debugger before the retrieval calls run.
breakpoint()
aux_data = parse_fr24(retrieve_from_fr24())
body = retrieve_from_inst()
pages = count_pages(body)
flights = find_table(body)
# The first page is already in `body`; pull in the remaining ones, if any.
for page in range(2, pages + 1):
    flights.extend(find_table(retrieve_from_inst(page)))
for flight in flights:
    print(get_details(flight, aux_data=aux_data))

19795
devloop/sample3.html 100644

File diff suppressed because one or more lines are too long

View File

@ -1,22 +1,25 @@
# -*- encoding: utf-8 -*-
import os
import pandas as pd
from latecomers.parse import find_table, get_details
from latecomers.serializer import to_excel
# Dev-loop helper: parse the saved sample page, serialise the rows with
# to_excel() and build a DataFrame from them. Superseded lines from the diff
# (old sample path, old get_details() call, old dict layout) are dropped.
with open("./devloop/sample3.html") as sample:
    content = sample.read()

flights = find_table(content)
data = [get_details(flight) for flight in flights]

print(to_excel(data))

# Field name -> Italian column header.
colonne = {
    "th_arrival": "Arrivo teorico",
    "real_arrival": "Arrivo reale",
    "code": "Codice volo",
    "origin": "Aeroporto di partenza",
    "status": "Stato",
}

df = pd.DataFrame(data, columns=colonne)

View File

@ -4,7 +4,7 @@ import sys
import typing as T
from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
from latecomers.parse import find_table, get_details, Details, parse_fr24
from latecomers.parse import count_pages, find_table, get_details, Details, parse_fr24
from latecomers.serializer import to_excel
from latecomers.notifier import Notifier
from latecomers.config import Config
@ -23,11 +23,19 @@ def main(config: Config):
The main cli entrypoint.
"""
out = Notifier(**config.smtp)
body = retrieve_from_inst()
pages = count_pages(body)
logger.info(f"found all data in {pages} page(s)")
table = find_table(body)
for page in range(2, pages + 1):
body = retrieve_from_inst(page)
table.extend(find_table(body))
fr24_data = retrieve_from_fr24()
aux_data = parse_fr24(fr24_data)
data: T.List[Details] = []
for row in table:
data.append(get_details(row, aux_data))

View File

@ -84,7 +84,10 @@ class Notifier(object):
email = message.as_string()
# Send once, to the primary recipients plus the cc list. A new list is built
# on purpose: `rcpt = to` followed by `rcpt.extend(cc)` would append the cc
# addresses to the caller's own `to` list, so they would pile up every time
# the same list is passed in again.
rcpt = [*to, *cc]
self.send(rcpt, email)
@retry_and_log(logger, RETRIES)
def send_no_data(self, to: T.List[T.Text], cc: T.List[T.Text]) -> None:
@ -107,4 +110,7 @@ il vostro tecnico preferito.
message.attach(MIMEText(body, "plain"))
email = message.as_string()
# Send once, to the primary recipients plus the cc list. A new list is built
# on purpose: `rcpt = to` followed by `rcpt.extend(cc)` would append the cc
# addresses to the caller's own `to` list, so they would pile up every time
# the same list is passed in again (e.g. if the retry wrapper re-invokes this
# method — NOTE(review): confirm how retry_and_log re-calls it).
rcpt = [*to, *cc]
self.send(rcpt, email)

View File

@ -20,26 +20,47 @@ logger = logging.getLogger(__name__)
def not_empty(obj: et._Element) -> bool:
    """
    Tell whether a table row should be kept as a data row.

    A row is kept when its ``class`` attribute does not contain
    ``lfr-template`` and it has exactly 5 or 6 ``<td>`` descendants.

    Raises RuntimeError when ``obj`` is not exactly an ``et._Element``.
    """
    if type(obj) is not et._Element:
        raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")

    # Default to "" so that a row without any class attribute does not make
    # the `in` test fail with a TypeError on None.
    if "lfr-template" in obj.attrib.get("class", ""):
        return False

    return len(obj.xpath(".//td")) in (5, 6)
@logit(logger)
def count_pages(html_content: T.Text) -> int:
    """
    Count how many pages there are to be accessed.

    The ``<li>`` items of the paginator are counted and two of them are
    discounted (presumably the previous/next controls -- TODO confirm against
    the live markup). When no paginator is found, a single page is assumed.
    The result is never lower than 1.
    """
    root = et.fromstring(html_content, parser=PARSER)
    li_items = root.xpath(
        "//div[contains(@data-qa-id, 'paginator')]/ul[contains(@class, 'pagination')]/li"
    )
    if not li_items:
        return 1

    # A paginator with fewer than three items would otherwise yield 0 or a
    # negative page count; the page already fetched always counts as one.
    return max(1, len(li_items) - 2)
@logit(logger)
def find_table(html_content: T.Text) -> T.List[et._ElementTree]:
    """
    Find the table that holds the data in the html response.

    Returns the ``<tr>`` rows of the single ``<tbody>`` whose ``data-qa-id``
    contains ``row`` and that pass ``not_empty``. An empty list is returned
    when the document has no ``<tbody>`` at all.

    Raises ValueError when more than one ``<tbody>`` is found.
    """
    root = et.fromstring(html_content, parser=PARSER)

    tbody = root.xpath("//tbody")
    if not tbody:
        return []
    if len(tbody) != 1:
        raise ValueError(f"Unexpected parsing result: found {len(tbody)} results")

    return [
        child
        for child in tbody[0].xpath(".//tr[contains(@data-qa-id, 'row')]")
        if not_empty(child)
    ]
@ -74,56 +95,73 @@ class Details(object):
status: Status = Status.UNKNOWN
fr24_landing_time: T.Optional[T.Text] = None
def maybe_parse_hour_th(self, h5: et._ElementTree) -> None:
def __init__(self, row: et._ElementTree) -> None:
    """
    Keep a reference to the table row; the maybe_parse_* methods query it
    through ``self.row.xpath(...)``.
    """
    self.row = row
def maybe_parse_hour_th(self) -> None:
    """
    This function fills the field related to the theoretical arrival hour.

    The value is the text of the row's ``date-estimated__time`` span. Nothing
    is set (only a debug message is logged) unless exactly one such span is
    found in the row.
    """
    hour = self.row.xpath(".//span[contains(@class, 'date-estimated__time')]")
    if len(hour) != 1:
        logger.debug("Cannot parse estimated time")
        return

    self.th_arrival = hour[0].text
def maybe_parse_hour_real(self) -> None:
    """
    This function fills the field related to the actual arrival hour.

    The value is the text of the row's ``date-actual__time`` span. Nothing is
    set (only a debug message is logged) unless exactly one such span is
    found in the row.
    """
    hour = self.row.xpath(".//span[contains(@class, 'date-actual__time')]")
    if len(hour) != 1:
        logger.debug("Cannot parse actual time")
        return

    self.real_arrival = hour[0].text
def maybe_parse_code(self) -> None:
    """
    This function fills the field related to the flight code, if present.

    The code is read from the single ``<strong>`` found under the row's
    ``lfr-departure-column-column`` cell; surrounding tabs/newlines/spaces are
    stripped and inner spaces removed. Nothing is set (only a debug message is
    logged) when the element is missing, ambiguous, or has no text.
    """
    code = self.row.xpath(
        ".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong"
    )
    if len(code) != 1:
        logger.debug("Cannot parse code")
        return

    text = code[0].text
    # An element without leading text has `.text` set to None; bail out
    # instead of failing on `.strip`.
    if text is None:
        logger.debug("Cannot parse code")
        return

    self.code = text.strip("\t\n ").replace(" ", "")
def maybe_parse_airport(self) -> None:
    """
    This function fills the field for the airport of origin.

    The name is the text of the single ``<h5>`` directly under the row's
    ``lfr-flight-departure-column`` cell, with surrounding tabs and newlines
    stripped. Nothing is set (only a debug message is logged) when the
    element is missing, ambiguous, or has no text.
    """
    airport = self.row.xpath(
        ".//td[contains(@class, 'lfr-flight-departure-column')]/h5"
    )
    if len(airport) != 1:
        logger.debug("Cannot parse airport")
        return

    text = airport[0].text
    # An element without leading text has `.text` set to None; bail out
    # instead of failing on `.strip`.
    if text is None:
        logger.debug("Cannot parse airport")
        return

    self.origin = text.strip("\t\n")
def maybe_parse_status(self) -> None:
    """
    This function fills the field for the status.

    The status is read from the single ``<h5>`` directly under the row's
    ``lfr-flight-status-column`` cell. When its class mentions ``arrivato`` or
    ``schedulato`` the text is converted as is; otherwise the text must yield
    exactly one ``STATUS_RE`` match, which is then converted. In every other
    case the status is left untouched.
    """
    status = self.row.xpath(
        ".//td[contains(@class, 'lfr-flight-status-column')]/h5"
    )
    if len(status) != 1:
        logger.debug("Cannot parse status")
        return

    text = status[0].text
    _class = status[0].attrib.get("class", "")
    if "arrivato" in _class or "schedulato" in _class:
        self.status = Status.from_str(text)
    elif text is None:
        # Nothing to run the regex on: `findall(None)` would raise TypeError.
        logger.debug("Cannot parse status")
    else:
        parsed = STATUS_RE.findall(text)
        if len(parsed) == 1:
            self.status = Status.from_str(parsed[0])
@ -133,6 +171,7 @@ class Details(object):
only FlightRadar24 data).
"""
if not self.code:
logger.debug("Cannot add aux data: missing code")
return
self.fr24_landing_time = aux_data.get(self.code)
@ -156,34 +195,18 @@ class Details(object):
def get_details(
table_entry: et._ElementTree,
aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None,
debug: bool = False,
) -> Details:
"""
Find the dates in a table row. If a strikenthrough time is found, it is
returned as second element in the tuple.
"""
res = table_entry.xpath(".//h5")
if len(res) > 6:
raise ValueError(f"Unexpected number of h5 found in line: {len(res)}")
d = Details(table_entry)
if debug:
for r in res:
txt = r.text.strip("\t\n ")
print(f"[DEBUG] text={txt} attrs={r.attrib}")
d = Details()
if len(res) == 5:
d.maybe_parse_hour_th(res[0])
d.maybe_parse_code(res[1])
d.maybe_parse_airport(res[2])
d.maybe_parse_status(res[3])
elif len(res) == 6:
d.maybe_parse_hour_th(res[0])
d.maybe_parse_hour_real(res[1])
d.maybe_parse_code(res[2])
d.maybe_parse_airport(res[3])
d.maybe_parse_status(res[4])
d.maybe_parse_hour_th()
d.maybe_parse_hour_real()
d.maybe_parse_code()
d.maybe_parse_airport()
d.maybe_parse_status()
if aux_data:
d.maybe_add_aux_data(aux_data)
@ -192,7 +215,7 @@ def get_details(
def parse_fr24(
data: T.Optional[T.Dict[T.Text, T.Any]]
data: T.Optional[T.Dict[T.Text, T.Any]],
) -> T.Optional[T.Dict[T.Text, T.Text]]:
"""
This function parses the given FlightRadar24 data into a pandas DataFrame.
@ -206,9 +229,7 @@ def parse_fr24(
results = {}
for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][
"arrivals"
][
"data"
]: # noqa: E501
]["data"]: # noqa: E501
try:
id_num = flight["flight"]["identification"]["number"]
if _code := id_num.get("default"):

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,6 @@
[project]
name = "latecomers"
version = "0.3.2"
version = "0.5.0"
description = "Retrieve and save data from ADR Ciampino airport"
authors = [{name="Leonardo Barcaroli", email="blallo@autistici.org"}]
license = {text="Public Domain"}
@ -24,6 +24,7 @@ dev = [
"ipython",
"black",
"build",
"ipdb",
]
[build-system]

View File

@ -0,0 +1,80 @@
# This file was autogenerated by uv v0.1.3 via the following command:
# uv pip compile --all-extras pyproject.toml -o requirements.dev.txt
asttokens==2.4.1
# via stack-data
black==24.2.0
build==1.0.3
certifi==2024.2.2
# via requests
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via black
decorator==5.1.1
# via
# ipdb
# ipython
et-xmlfile==1.1.0
# via openpyxl
executing==2.0.1
# via stack-data
idna==3.6
# via requests
ipdb==0.13.13
ipython==8.21.0
# via ipdb
jedi==0.19.1
# via ipython
lxml==5.1.0
matplotlib-inline==0.1.6
# via ipython
mypy-extensions==1.0.0
# via black
numpy==1.26.4
# via pandas
openpyxl==3.1.2
packaging==23.2
# via
# black
# build
pandas==2.2.0
parso==0.8.3
# via jedi
pathspec==0.12.1
# via black
pexpect==4.9.0
# via ipython
platformdirs==4.2.0
# via black
prompt-toolkit==3.0.43
# via ipython
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
pygments==2.17.2
# via ipython
pyproject-hooks==1.0.0
# via build
python-dateutil==2.8.2
# via pandas
pytz==2024.1
# via pandas
pyyaml==6.0.1
requests==2.31.0
six==1.16.0
# via
# asttokens
# python-dateutil
stack-data==0.6.3
# via ipython
traitlets==5.14.1
# via
# ipython
# matplotlib-inline
tzdata==2024.1
# via pandas
urllib3==2.2.0
# via requests
wcwidth==0.2.13
# via prompt-toolkit