Compare commits

...

7 Commits

Author SHA1 Message Date
b9d89c3f0c
Bump to v0.5.0 2024-05-06 01:11:10 +02:00
4f67b1c97a
Iterate over pages 2024-05-06 01:10:43 +02:00
6e56f90b6b
Remove empty template line 2024-05-06 01:10:23 +02:00
5405efbbc4
Bump to v0.4.0 2024-05-05 23:29:12 +02:00
789091f7a8
Use uv machinery and add ipdb 2024-02-17 16:34:08 +01:00
a8b321a47d
Improve devloop 2024-02-17 16:33:52 +01:00
31f2636dd8
Fix parsing 2024-02-17 16:33:34 +01:00
9 changed files with 19987 additions and 78 deletions

View File

@ -1,12 +1,10 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
import os
from latecomers.parse import find_table, get_details from latecomers.parse import find_table, get_details
with open("./devloop/sample.html") as f: with open("./devloop/sample3.html") as f:
content = f.read() content = f.read()
flights = find_table(content) flights = find_table(content)
for f in flights: for f in flights:
print(get_details(f, os.environ.get("DEBUG") is not None)) print(get_details(f))

View File

@ -2,14 +2,17 @@
import os import os
from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24 from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
from latecomers.parse import find_table, get_details, parse_fr24 from latecomers.parse import count_pages, find_table, get_details, parse_fr24
body = retrieve_from_inst()
flights = find_table(body)
aux_data = parse_fr24(retrieve_from_fr24())
breakpoint() breakpoint()
aux_data = parse_fr24(retrieve_from_fr24())
body = retrieve_from_inst()
pages = count_pages(body)
flights = find_table(body)
for page in range(2, pages + 1):
body = retrieve_from_inst(page)
flights.extend(find_table(body))
for f in flights: for f in flights:
print(get_details(f, aux_data=aux_data, debug=os.environ.get("DEBUG") is not None)) print(get_details(f, aux_data=aux_data))

19795
devloop/sample3.html Normal file

File diff suppressed because one or more lines are too long

View File

@ -1,22 +1,25 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
import os
import pandas as pd import pandas as pd
from latecomers.parse import find_table, get_details from latecomers.parse import find_table, get_details
from latecomers.serializer import to_excel from latecomers.serializer import to_excel
with open("./devloop/sample.html") as f: with open("./devloop/sample3.html") as f:
content = f.read() content = f.read()
flights = find_table(content) flights = find_table(content)
data = [] data = []
for f in flights: for f in flights:
data.append(get_details(f, os.environ.get("DEBUG") is not None)) data.append(get_details(f))
print(to_excel(data)) print(to_excel(data))
colonne = {"th_arrival": "Arrivo teorico", "real_arrival": "Arrivo reale", colonne = {
"code": "Codice volo", "origin": "Aeroporto di partenza", "status": "Stato"} "th_arrival": "Arrivo teorico",
"real_arrival": "Arrivo reale",
"code": "Codice volo",
"origin": "Aeroporto di partenza",
"status": "Stato",
}
df = pd.DataFrame(data, columns=colonne) df = pd.DataFrame(data, columns=colonne)

View File

@ -4,7 +4,7 @@ import sys
import typing as T import typing as T
from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24 from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
from latecomers.parse import find_table, get_details, Details, parse_fr24 from latecomers.parse import count_pages, find_table, get_details, Details, parse_fr24
from latecomers.serializer import to_excel from latecomers.serializer import to_excel
from latecomers.notifier import Notifier from latecomers.notifier import Notifier
from latecomers.config import Config from latecomers.config import Config
@ -23,11 +23,19 @@ def main(config: Config):
The main cli entrypoint. The main cli entrypoint.
""" """
out = Notifier(**config.smtp) out = Notifier(**config.smtp)
body = retrieve_from_inst() body = retrieve_from_inst()
pages = count_pages(body)
logger.info(f"found all data in {pages} page(s)")
table = find_table(body) table = find_table(body)
for page in range(2, pages + 1):
body = retrieve_from_inst(page)
table.extend(find_table(body))
fr24_data = retrieve_from_fr24() fr24_data = retrieve_from_fr24()
aux_data = parse_fr24(fr24_data) aux_data = parse_fr24(fr24_data)
data: T.List[Details] = [] data: T.List[Details] = []
for row in table: for row in table:
data.append(get_details(row, aux_data)) data.append(get_details(row, aux_data))

View File

@ -20,26 +20,47 @@ logger = logging.getLogger(__name__)
def not_empty(obj: et._Element) -> bool: def not_empty(obj: et._Element) -> bool:
if type(obj) is et._Element: if type(obj) is et._Element:
children = len(obj.xpath(".//h5")) if "lfr-template" in obj.attrib.get("class"):
return children == 5 or children == 6 return False
children = len(obj.xpath(".//td"))
return children in (5, 6)
raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}") raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")
@logit(logger)
def count_pages(html_content: T.Text) -> int:
"""
Count how many pages there are to be accessed
"""
root = et.fromstring(html_content, parser=PARSER)
il_items = root.xpath(
"//div[contains(@data-qa-id, 'paginator')]/ul[contains(@class, 'pagination')]/li"
)
if not il_items:
return 1
return len(il_items) - 2
@logit(logger) @logit(logger)
def find_table(html_content: T.Text) -> T.List[et._ElementTree]: def find_table(html_content: T.Text) -> T.List[et._ElementTree]:
""" """
Find the table that holds the data in the html response Find the table that holds the data in the html response
""" """
root = et.fromstring(html_content, parser=PARSER) root = et.fromstring(html_content, parser=PARSER)
tbody = root.xpath("//tbody[contains(@class, 'table-data')]") tbody = root.xpath("//tbody")
if not tbody: if not tbody:
return [] return []
if len(tbody) != 1: if len(tbody) != 1:
raise ValueError(f"Unexpected parsing result: found {len(tbody)} results") raise ValueError(f"Unexpected parsing result: found {len(tbody)} results")
result = [child for child in tbody[0].xpath(".//tr") if not_empty(child)] result = [
child
for child in tbody[0].xpath(".//tr[contains(@data-qa-id, 'row')]")
if not_empty(child)
]
return result return result
@ -74,56 +95,73 @@ class Details(object):
status: Status = Status.UNKNOWN status: Status = Status.UNKNOWN
fr24_landing_time: T.Optional[T.Text] = None fr24_landing_time: T.Optional[T.Text] = None
def maybe_parse_hour_th(self, h5: et._ElementTree) -> None: def __init__(self, row: et._ElementTree) -> None:
self.row = row
def maybe_parse_hour_th(self) -> None:
""" """
This function fills the fields related to the theoretical arrival hour, This function fills the fields related to the theoretical arrival hour,
if the input matches some heuristics. if the input matches some heuristics.
""" """
hour = TIME_RE.findall(h5.text) hour = self.row.xpath(".//span[contains(@class, 'date-estimated__time')]")
if len(hour) == 1: if len(hour) != 1:
self.th_arrival = hour[0] logger.debug("Cannot parse estimated time")
if "text-decoration: line-through" in h5.attrib.get("style", ""): return
self.real_arrival = None self.th_arrival = hour[0].text
def maybe_parse_hour_real(self, h5: et._ElementTree) -> None: def maybe_parse_hour_real(self) -> None:
""" """
This function fills the fields related to the real arrival hour, This function fills the fields related to the real arrival hour,
if the input matches some heuristics. if the input matches some heuristics.
""" """
hour = TIME_RE.findall(h5.text) hour = self.row.xpath(".//span[contains(@class, 'date-actual__time')]")
if len(hour) == 1: if len(hour) != 1:
self.real_arrival = hour[0] logger.debug("Cannot parse actual time")
return
self.real_arrival = hour[0].text
def maybe_parse_code(self, h5: et._ElementTree) -> None: def maybe_parse_code(self) -> None:
""" """
This function fills the fields related to the flight code, This function fills the fields related to the flight code,
if present and the input matches some heuristics. if present and the input matches some heuristics.
""" """
if "flight-numb" not in h5.attrib.get("class", ""): code = self.row.xpath(
".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong"
)
if len(code) != 1:
logger.debug("Cannot parse code")
return return
child = h5.xpath(".//strong") self.code = code[0].text.strip("\t\n ").replace(" ", "")
if len(child) == 1:
self.code = child[0].text.strip("\t\n ").replace(" ", "")
def maybe_parse_airport(self, h5: et._ElementTree) -> None: def maybe_parse_airport(self) -> None:
""" """
This function fills the field for the airport, if the input matches some This function fills the field for the airport, if the input matches some
heuristics. heuristics.
""" """
airport = h5.text.strip("\t\n") airport = self.row.xpath(
if len(airport) > 0 and "blue-title" in h5.attrib.get("class", ""): ".//td[contains(@class, 'lfr-flight-departure-column')]/h5"
self.origin = airport )
if len(airport) != 1:
logger.debug("Cannot parse airport")
return
self.origin = airport[0].text.strip("\t\n")
def maybe_parse_status(self, h5: et._ElementTree) -> None: def maybe_parse_status(self) -> None:
""" """
This function fills the field for the status, if the input matches some This function fills the field for the status, if the input matches some
heuristics. heuristics.
""" """
_class = h5.attrib.get("class", "") status = self.row.xpath(
".//td[contains(@class, 'lfr-flight-status-column')]/h5"
)
if len(status) != 1:
logger.debug("Cannot parse status")
return
_class = status[0].attrib.get("class", "")
if "arrivato" in _class or "schedulato" in _class: if "arrivato" in _class or "schedulato" in _class:
self.status = Status.from_str(h5.text) self.status = Status.from_str(status[0].text)
else: else:
parsed = STATUS_RE.findall(h5.text) parsed = STATUS_RE.findall(status[0].text)
if len(parsed) == 1: if len(parsed) == 1:
self.status = Status.from_str(parsed[0]) self.status = Status.from_str(parsed[0])
@ -133,6 +171,7 @@ class Details(object):
only FlightRadar24 data). only FlightRadar24 data).
""" """
if not self.code: if not self.code:
logger.debug("Cannot add aux data: missing code")
return return
self.fr24_landing_time = aux_data.get(self.code) self.fr24_landing_time = aux_data.get(self.code)
@ -156,34 +195,18 @@ class Details(object):
def get_details( def get_details(
table_entry: et._ElementTree, table_entry: et._ElementTree,
aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None, aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None,
debug: bool = False,
) -> Details: ) -> Details:
""" """
Find the dates in a table row. If a struck-through time is found, it is Find the dates in a table row. If a struck-through time is found, it is
returned as second element in the tuple. returned as second element in the tuple.
""" """
res = table_entry.xpath(".//h5") d = Details(table_entry)
if len(res) > 6:
raise ValueError(f"Unexpected number of h5 found in line: {len(res)}")
if debug: d.maybe_parse_hour_th()
for r in res: d.maybe_parse_hour_real()
txt = r.text.strip("\t\n ") d.maybe_parse_code()
print(f"[DEBUG] text={txt} attrs={r.attrib}") d.maybe_parse_airport()
d.maybe_parse_status()
d = Details()
if len(res) == 5:
d.maybe_parse_hour_th(res[0])
d.maybe_parse_code(res[1])
d.maybe_parse_airport(res[2])
d.maybe_parse_status(res[3])
elif len(res) == 6:
d.maybe_parse_hour_th(res[0])
d.maybe_parse_hour_real(res[1])
d.maybe_parse_code(res[2])
d.maybe_parse_airport(res[3])
d.maybe_parse_status(res[4])
if aux_data: if aux_data:
d.maybe_add_aux_data(aux_data) d.maybe_add_aux_data(aux_data)
@ -192,7 +215,7 @@ def get_details(
def parse_fr24( def parse_fr24(
data: T.Optional[T.Dict[T.Text, T.Any]] data: T.Optional[T.Dict[T.Text, T.Any]],
) -> T.Optional[T.Dict[T.Text, T.Text]]: ) -> T.Optional[T.Dict[T.Text, T.Text]]:
""" """
This function parses the given FlightRadar24 data into a pandas DataFrame. This function parses the given FlightRadar24 data into a pandas DataFrame.
@ -206,9 +229,7 @@ def parse_fr24(
results = {} results = {}
for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][ for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][
"arrivals" "arrivals"
][ ]["data"]: # noqa: E501
"data"
]: # noqa: E501
try: try:
id_num = flight["flight"]["identification"]["number"] id_num = flight["flight"]["identification"]["number"]
if _code := id_num.get("default"): if _code := id_num.get("default"):

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,6 @@
[project] [project]
name = "latecomers" name = "latecomers"
version = "0.3.3" version = "0.5.0"
description = "Retrieve and save data from ADR Ciampino airport" description = "Retrieve and save data from ADR Ciampino airport"
authors = [{name="Leonardo Barcaroli", email="blallo@autistici.org"}] authors = [{name="Leonardo Barcaroli", email="blallo@autistici.org"}]
license = {text="Public Domain"} license = {text="Public Domain"}
@ -24,6 +24,7 @@ dev = [
"ipython", "ipython",
"black", "black",
"build", "build",
"ipdb",
] ]
[build-system] [build-system]

80
requirements.dev.txt Normal file
View File

@ -0,0 +1,80 @@
# This file was autogenerated by uv v0.1.3 via the following command:
# uv pip compile --all-extras pyproject.toml -o requirements.dev.txt
asttokens==2.4.1
# via stack-data
black==24.2.0
build==1.0.3
certifi==2024.2.2
# via requests
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via black
decorator==5.1.1
# via
# ipdb
# ipython
et-xmlfile==1.1.0
# via openpyxl
executing==2.0.1
# via stack-data
idna==3.6
# via requests
ipdb==0.13.13
ipython==8.21.0
# via ipdb
jedi==0.19.1
# via ipython
lxml==5.1.0
matplotlib-inline==0.1.6
# via ipython
mypy-extensions==1.0.0
# via black
numpy==1.26.4
# via pandas
openpyxl==3.1.2
packaging==23.2
# via
# black
# build
pandas==2.2.0
parso==0.8.3
# via jedi
pathspec==0.12.1
# via black
pexpect==4.9.0
# via ipython
platformdirs==4.2.0
# via black
prompt-toolkit==3.0.43
# via ipython
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
pygments==2.17.2
# via ipython
pyproject-hooks==1.0.0
# via build
python-dateutil==2.8.2
# via pandas
pytz==2024.1
# via pandas
pyyaml==6.0.1
requests==2.31.0
six==1.16.0
# via
# asttokens
# python-dateutil
stack-data==0.6.3
# via ipython
traitlets==5.14.1
# via
# ipython
# matplotlib-inline
tzdata==2024.1
# via pandas
urllib3==2.2.0
# via requests
wcwidth==0.2.13
# via prompt-toolkit