Compare commits
7 Commits
fa0023d2d1
...
b9d89c3f0c
Author | SHA1 | Date | |
---|---|---|---|
b9d89c3f0c | |||
4f67b1c97a | |||
6e56f90b6b | |||
5405efbbc4 | |||
789091f7a8 | |||
a8b321a47d | |||
31f2636dd8 |
|
@ -1,12 +1,10 @@
|
||||||
# -*- encoding: utf-8 -*-
|
# -*- encoding: utf-8 -*-
|
||||||
import os
|
|
||||||
|
|
||||||
from latecomers.parse import find_table, get_details
|
from latecomers.parse import find_table, get_details
|
||||||
|
|
||||||
with open("./devloop/sample.html") as f:
|
with open("./devloop/sample3.html") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
|
|
||||||
flights = find_table(content)
|
flights = find_table(content)
|
||||||
|
|
||||||
for f in flights:
|
for f in flights:
|
||||||
print(get_details(f, os.environ.get("DEBUG") is not None))
|
print(get_details(f))
|
||||||
|
|
|
@ -2,14 +2,17 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
|
from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
|
||||||
from latecomers.parse import find_table, get_details, parse_fr24
|
from latecomers.parse import count_pages, find_table, get_details, parse_fr24
|
||||||
|
|
||||||
body = retrieve_from_inst()
|
|
||||||
|
|
||||||
flights = find_table(body)
|
|
||||||
|
|
||||||
aux_data = parse_fr24(retrieve_from_fr24())
|
|
||||||
|
|
||||||
breakpoint()
|
breakpoint()
|
||||||
|
aux_data = parse_fr24(retrieve_from_fr24())
|
||||||
|
|
||||||
|
body = retrieve_from_inst()
|
||||||
|
pages = count_pages(body)
|
||||||
|
flights = find_table(body)
|
||||||
|
for page in range(2, pages + 1):
|
||||||
|
body = retrieve_from_inst(page)
|
||||||
|
flights.extend(find_table(body))
|
||||||
|
|
||||||
for f in flights:
|
for f in flights:
|
||||||
print(get_details(f, aux_data=aux_data, debug=os.environ.get("DEBUG") is not None))
|
print(get_details(f, aux_data=aux_data))
|
||||||
|
|
19795
devloop/sample3.html
Normal file
19795
devloop/sample3.html
Normal file
File diff suppressed because one or more lines are too long
|
@ -1,22 +1,25 @@
|
||||||
# -*- encoding: utf-8 -*-
|
# -*- encoding: utf-8 -*-
|
||||||
import os
|
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from latecomers.parse import find_table, get_details
|
from latecomers.parse import find_table, get_details
|
||||||
from latecomers.serializer import to_excel
|
from latecomers.serializer import to_excel
|
||||||
|
|
||||||
with open("./devloop/sample.html") as f:
|
with open("./devloop/sample3.html") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
|
|
||||||
flights = find_table(content)
|
flights = find_table(content)
|
||||||
data = []
|
data = []
|
||||||
|
|
||||||
for f in flights:
|
for f in flights:
|
||||||
data.append(get_details(f, os.environ.get("DEBUG") is not None))
|
data.append(get_details(f))
|
||||||
|
|
||||||
print(to_excel(data))
|
print(to_excel(data))
|
||||||
|
|
||||||
colonne = {"th_arrival": "Arrivo teorico", "real_arrival": "Arrivo reale",
|
colonne = {
|
||||||
"code": "Codice volo", "origin": "Aeroporto di partenza", "status": "Stato"}
|
"th_arrival": "Arrivo teorico",
|
||||||
|
"real_arrival": "Arrivo reale",
|
||||||
|
"code": "Codice volo",
|
||||||
|
"origin": "Aeroporto di partenza",
|
||||||
|
"status": "Stato",
|
||||||
|
}
|
||||||
df = pd.DataFrame(data, columns=colonne)
|
df = pd.DataFrame(data, columns=colonne)
|
||||||
|
|
|
@ -4,7 +4,7 @@ import sys
|
||||||
import typing as T
|
import typing as T
|
||||||
|
|
||||||
from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
|
from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
|
||||||
from latecomers.parse import find_table, get_details, Details, parse_fr24
|
from latecomers.parse import count_pages, find_table, get_details, Details, parse_fr24
|
||||||
from latecomers.serializer import to_excel
|
from latecomers.serializer import to_excel
|
||||||
from latecomers.notifier import Notifier
|
from latecomers.notifier import Notifier
|
||||||
from latecomers.config import Config
|
from latecomers.config import Config
|
||||||
|
@ -23,11 +23,19 @@ def main(config: Config):
|
||||||
The main cli entrypoint.
|
The main cli entrypoint.
|
||||||
"""
|
"""
|
||||||
out = Notifier(**config.smtp)
|
out = Notifier(**config.smtp)
|
||||||
|
|
||||||
body = retrieve_from_inst()
|
body = retrieve_from_inst()
|
||||||
|
pages = count_pages(body)
|
||||||
|
logger.info(f"found all data in {pages} page(s)")
|
||||||
table = find_table(body)
|
table = find_table(body)
|
||||||
|
for page in range(2, pages + 1):
|
||||||
|
body = retrieve_from_inst(page)
|
||||||
|
table.extend(find_table(body))
|
||||||
|
|
||||||
fr24_data = retrieve_from_fr24()
|
fr24_data = retrieve_from_fr24()
|
||||||
aux_data = parse_fr24(fr24_data)
|
aux_data = parse_fr24(fr24_data)
|
||||||
data: T.List[Details] = []
|
data: T.List[Details] = []
|
||||||
|
|
||||||
for row in table:
|
for row in table:
|
||||||
data.append(get_details(row, aux_data))
|
data.append(get_details(row, aux_data))
|
||||||
|
|
||||||
|
|
|
@ -20,26 +20,47 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def not_empty(obj: et._Element) -> bool:
|
def not_empty(obj: et._Element) -> bool:
|
||||||
if type(obj) is et._Element:
|
if type(obj) is et._Element:
|
||||||
children = len(obj.xpath(".//h5"))
|
if "lfr-template" in obj.attrib.get("class"):
|
||||||
return children == 5 or children == 6
|
return False
|
||||||
|
children = len(obj.xpath(".//td"))
|
||||||
|
return children in (5, 6)
|
||||||
|
|
||||||
raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")
|
raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")
|
||||||
|
|
||||||
|
|
||||||
|
@logit(logger)
|
||||||
|
def count_pages(html_content: T.Text) -> int:
|
||||||
|
"""
|
||||||
|
Count how many pages there are to be accessed
|
||||||
|
"""
|
||||||
|
root = et.fromstring(html_content, parser=PARSER)
|
||||||
|
il_items = root.xpath(
|
||||||
|
"//div[contains(@data-qa-id, 'paginator')]/ul[contains(@class, 'pagination')]/li"
|
||||||
|
)
|
||||||
|
if not il_items:
|
||||||
|
return 1
|
||||||
|
|
||||||
|
return len(il_items) - 2
|
||||||
|
|
||||||
|
|
||||||
@logit(logger)
|
@logit(logger)
|
||||||
def find_table(html_content: T.Text) -> T.List[et._ElementTree]:
|
def find_table(html_content: T.Text) -> T.List[et._ElementTree]:
|
||||||
"""
|
"""
|
||||||
Find the table that holds the data in the html response
|
Find the table that holds the data in the html response
|
||||||
"""
|
"""
|
||||||
root = et.fromstring(html_content, parser=PARSER)
|
root = et.fromstring(html_content, parser=PARSER)
|
||||||
tbody = root.xpath("//tbody[contains(@class, 'table-data')]")
|
tbody = root.xpath("//tbody")
|
||||||
if not tbody:
|
if not tbody:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
if len(tbody) != 1:
|
if len(tbody) != 1:
|
||||||
raise ValueError(f"Unexpected parsing result: found {len(tbody)} results")
|
raise ValueError(f"Unexpected parsing result: found {len(tbody)} results")
|
||||||
|
|
||||||
result = [child for child in tbody[0].xpath(".//tr") if not_empty(child)]
|
result = [
|
||||||
|
child
|
||||||
|
for child in tbody[0].xpath(".//tr[contains(@data-qa-id, 'row')]")
|
||||||
|
if not_empty(child)
|
||||||
|
]
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
@ -74,56 +95,73 @@ class Details(object):
|
||||||
status: Status = Status.UNKNOWN
|
status: Status = Status.UNKNOWN
|
||||||
fr24_landing_time: T.Optional[T.Text] = None
|
fr24_landing_time: T.Optional[T.Text] = None
|
||||||
|
|
||||||
def maybe_parse_hour_th(self, h5: et._ElementTree) -> None:
|
def __init__(self, row: et._ElementTree) -> None:
|
||||||
|
self.row = row
|
||||||
|
|
||||||
|
def maybe_parse_hour_th(self) -> None:
|
||||||
"""
|
"""
|
||||||
This function fills the fileds related to the theoric arrival hour,
|
This function fills the fileds related to the theoric arrival hour,
|
||||||
if the input matches some heuristics.
|
if the input matches some heuristics.
|
||||||
"""
|
"""
|
||||||
hour = TIME_RE.findall(h5.text)
|
hour = self.row.xpath(".//span[contains(@class, 'date-estimated__time')]")
|
||||||
if len(hour) == 1:
|
if len(hour) != 1:
|
||||||
self.th_arrival = hour[0]
|
logger.debug("Cannot parse estimated time")
|
||||||
if "text-decoration: line-through" in h5.attrib.get("style", ""):
|
return
|
||||||
self.real_arrival = None
|
self.th_arrival = hour[0].text
|
||||||
|
|
||||||
def maybe_parse_hour_real(self, h5: et._ElementTree) -> None:
|
def maybe_parse_hour_real(self) -> None:
|
||||||
"""
|
"""
|
||||||
This function fills the fileds related to the theoric arrival hour,
|
This function fills the fileds related to the theoric arrival hour,
|
||||||
if the input matches some heuristics.
|
if the input matches some heuristics.
|
||||||
"""
|
"""
|
||||||
hour = TIME_RE.findall(h5.text)
|
hour = self.row.xpath(".//span[contains(@class, 'date-actual__time')]")
|
||||||
if len(hour) == 1:
|
if len(hour) != 1:
|
||||||
self.real_arrival = hour[0]
|
logger.debug("Cannot parse actual time")
|
||||||
|
return
|
||||||
|
self.real_arrival = hour[0].text
|
||||||
|
|
||||||
def maybe_parse_code(self, h5: et._ElementTree) -> None:
|
def maybe_parse_code(self) -> None:
|
||||||
"""
|
"""
|
||||||
This function fills the fileds related to the flight code,
|
This function fills the fileds related to the flight code,
|
||||||
if present and the input matches some heuristics.
|
if present and the input matches some heuristics.
|
||||||
"""
|
"""
|
||||||
if "flight-numb" not in h5.attrib.get("class", ""):
|
code = self.row.xpath(
|
||||||
|
".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong"
|
||||||
|
)
|
||||||
|
if len(code) != 1:
|
||||||
|
logger.debug("Cannot parse code")
|
||||||
return
|
return
|
||||||
child = h5.xpath(".//strong")
|
self.code = code[0].text.strip("\t\n ").replace(" ", "")
|
||||||
if len(child) == 1:
|
|
||||||
self.code = child[0].text.strip("\t\n ").replace(" ", "")
|
|
||||||
|
|
||||||
def maybe_parse_airport(self, h5: et._ElementTree) -> None:
|
def maybe_parse_airport(self) -> None:
|
||||||
"""
|
"""
|
||||||
This function fills the field for the airport, if the input matches some
|
This function fills the field for the airport, if the input matches some
|
||||||
heuristics.
|
heuristics.
|
||||||
"""
|
"""
|
||||||
airport = h5.text.strip("\t\n")
|
airport = self.row.xpath(
|
||||||
if len(airport) > 0 and "blue-title" in h5.attrib.get("class", ""):
|
".//td[contains(@class, 'lfr-flight-departure-column')]/h5"
|
||||||
self.origin = airport
|
)
|
||||||
|
if len(airport) != 1:
|
||||||
|
logger.debug("Cannot parse airport")
|
||||||
|
return
|
||||||
|
self.origin = airport[0].text.strip("\t\n")
|
||||||
|
|
||||||
def maybe_parse_status(self, h5: et._ElementTree) -> None:
|
def maybe_parse_status(self) -> None:
|
||||||
"""
|
"""
|
||||||
This function fills the filed for the status, if the input matches some
|
This function fills the filed for the status, if the input matches some
|
||||||
heuristics.
|
heuristics.
|
||||||
"""
|
"""
|
||||||
_class = h5.attrib.get("class", "")
|
status = self.row.xpath(
|
||||||
|
".//td[contains(@class, 'lfr-flight-status-column')]/h5"
|
||||||
|
)
|
||||||
|
if len(status) != 1:
|
||||||
|
logger.debug("Cannot parse status")
|
||||||
|
return
|
||||||
|
_class = status[0].attrib.get("class", "")
|
||||||
if "arrivato" in _class or "schedulato" in _class:
|
if "arrivato" in _class or "schedulato" in _class:
|
||||||
self.status = Status.from_str(h5.text)
|
self.status = Status.from_str(status[0].text)
|
||||||
else:
|
else:
|
||||||
parsed = STATUS_RE.findall(h5.text)
|
parsed = STATUS_RE.findall(status[0].text)
|
||||||
if len(parsed) == 1:
|
if len(parsed) == 1:
|
||||||
self.status = Status.from_str(parsed[0])
|
self.status = Status.from_str(parsed[0])
|
||||||
|
|
||||||
|
@ -133,6 +171,7 @@ class Details(object):
|
||||||
only FlightRadar24 data).
|
only FlightRadar24 data).
|
||||||
"""
|
"""
|
||||||
if not self.code:
|
if not self.code:
|
||||||
|
logger.debug("Cannot add aux data: missing code")
|
||||||
return
|
return
|
||||||
|
|
||||||
self.fr24_landing_time = aux_data.get(self.code)
|
self.fr24_landing_time = aux_data.get(self.code)
|
||||||
|
@ -156,34 +195,18 @@ class Details(object):
|
||||||
def get_details(
|
def get_details(
|
||||||
table_entry: et._ElementTree,
|
table_entry: et._ElementTree,
|
||||||
aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None,
|
aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None,
|
||||||
debug: bool = False,
|
|
||||||
) -> Details:
|
) -> Details:
|
||||||
"""
|
"""
|
||||||
Find the dates in a table row. If a strikenthrough time is found, it is
|
Find the dates in a table row. If a strikenthrough time is found, it is
|
||||||
returned as second element in the tuple.
|
returned as second element in the tuple.
|
||||||
"""
|
"""
|
||||||
res = table_entry.xpath(".//h5")
|
d = Details(table_entry)
|
||||||
if len(res) > 6:
|
|
||||||
raise ValueError(f"Unexpected number of h5 found in line: {len(res)}")
|
|
||||||
|
|
||||||
if debug:
|
d.maybe_parse_hour_th()
|
||||||
for r in res:
|
d.maybe_parse_hour_real()
|
||||||
txt = r.text.strip("\t\n ")
|
d.maybe_parse_code()
|
||||||
print(f"[DEBUG] text={txt} attrs={r.attrib}")
|
d.maybe_parse_airport()
|
||||||
|
d.maybe_parse_status()
|
||||||
d = Details()
|
|
||||||
|
|
||||||
if len(res) == 5:
|
|
||||||
d.maybe_parse_hour_th(res[0])
|
|
||||||
d.maybe_parse_code(res[1])
|
|
||||||
d.maybe_parse_airport(res[2])
|
|
||||||
d.maybe_parse_status(res[3])
|
|
||||||
elif len(res) == 6:
|
|
||||||
d.maybe_parse_hour_th(res[0])
|
|
||||||
d.maybe_parse_hour_real(res[1])
|
|
||||||
d.maybe_parse_code(res[2])
|
|
||||||
d.maybe_parse_airport(res[3])
|
|
||||||
d.maybe_parse_status(res[4])
|
|
||||||
|
|
||||||
if aux_data:
|
if aux_data:
|
||||||
d.maybe_add_aux_data(aux_data)
|
d.maybe_add_aux_data(aux_data)
|
||||||
|
@ -192,7 +215,7 @@ def get_details(
|
||||||
|
|
||||||
|
|
||||||
def parse_fr24(
|
def parse_fr24(
|
||||||
data: T.Optional[T.Dict[T.Text, T.Any]]
|
data: T.Optional[T.Dict[T.Text, T.Any]],
|
||||||
) -> T.Optional[T.Dict[T.Text, T.Text]]:
|
) -> T.Optional[T.Dict[T.Text, T.Text]]:
|
||||||
"""
|
"""
|
||||||
This function parses the given FlightRadar24 data into a pandas DataFrame.
|
This function parses the given FlightRadar24 data into a pandas DataFrame.
|
||||||
|
@ -206,9 +229,7 @@ def parse_fr24(
|
||||||
results = {}
|
results = {}
|
||||||
for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][
|
for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][
|
||||||
"arrivals"
|
"arrivals"
|
||||||
][
|
]["data"]: # noqa: E501
|
||||||
"data"
|
|
||||||
]: # noqa: E501
|
|
||||||
try:
|
try:
|
||||||
id_num = flight["flight"]["identification"]["number"]
|
id_num = flight["flight"]["identification"]["number"]
|
||||||
if _code := id_num.get("default"):
|
if _code := id_num.get("default"):
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,6 +1,6 @@
|
||||||
[project]
|
[project]
|
||||||
name = "latecomers"
|
name = "latecomers"
|
||||||
version = "0.3.3"
|
version = "0.5.0"
|
||||||
description = "Retrieve and save data from ADR Ciampino airport"
|
description = "Retrieve and save data from ADR Ciampino airport"
|
||||||
authors = [{name="Leonardo Barcaroli", email="blallo@autistici.org"}]
|
authors = [{name="Leonardo Barcaroli", email="blallo@autistici.org"}]
|
||||||
license = {text="Public Domain"}
|
license = {text="Public Domain"}
|
||||||
|
@ -24,6 +24,7 @@ dev = [
|
||||||
"ipython",
|
"ipython",
|
||||||
"black",
|
"black",
|
||||||
"build",
|
"build",
|
||||||
|
"ipdb",
|
||||||
]
|
]
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
|
80
requirements.dev.txt
Normal file
80
requirements.dev.txt
Normal file
|
@ -0,0 +1,80 @@
|
||||||
|
# This file was autogenerated by uv v0.1.3 via the following command:
|
||||||
|
# uv pip compile --all-extras pyproject.toml -o requirements.dev.txt
|
||||||
|
asttokens==2.4.1
|
||||||
|
# via stack-data
|
||||||
|
black==24.2.0
|
||||||
|
build==1.0.3
|
||||||
|
certifi==2024.2.2
|
||||||
|
# via requests
|
||||||
|
charset-normalizer==3.3.2
|
||||||
|
# via requests
|
||||||
|
click==8.1.7
|
||||||
|
# via black
|
||||||
|
decorator==5.1.1
|
||||||
|
# via
|
||||||
|
# ipdb
|
||||||
|
# ipython
|
||||||
|
et-xmlfile==1.1.0
|
||||||
|
# via openpyxl
|
||||||
|
executing==2.0.1
|
||||||
|
# via stack-data
|
||||||
|
idna==3.6
|
||||||
|
# via requests
|
||||||
|
ipdb==0.13.13
|
||||||
|
ipython==8.21.0
|
||||||
|
# via ipdb
|
||||||
|
jedi==0.19.1
|
||||||
|
# via ipython
|
||||||
|
lxml==5.1.0
|
||||||
|
matplotlib-inline==0.1.6
|
||||||
|
# via ipython
|
||||||
|
mypy-extensions==1.0.0
|
||||||
|
# via black
|
||||||
|
numpy==1.26.4
|
||||||
|
# via pandas
|
||||||
|
openpyxl==3.1.2
|
||||||
|
packaging==23.2
|
||||||
|
# via
|
||||||
|
# black
|
||||||
|
# build
|
||||||
|
pandas==2.2.0
|
||||||
|
parso==0.8.3
|
||||||
|
# via jedi
|
||||||
|
pathspec==0.12.1
|
||||||
|
# via black
|
||||||
|
pexpect==4.9.0
|
||||||
|
# via ipython
|
||||||
|
platformdirs==4.2.0
|
||||||
|
# via black
|
||||||
|
prompt-toolkit==3.0.43
|
||||||
|
# via ipython
|
||||||
|
ptyprocess==0.7.0
|
||||||
|
# via pexpect
|
||||||
|
pure-eval==0.2.2
|
||||||
|
# via stack-data
|
||||||
|
pygments==2.17.2
|
||||||
|
# via ipython
|
||||||
|
pyproject-hooks==1.0.0
|
||||||
|
# via build
|
||||||
|
python-dateutil==2.8.2
|
||||||
|
# via pandas
|
||||||
|
pytz==2024.1
|
||||||
|
# via pandas
|
||||||
|
pyyaml==6.0.1
|
||||||
|
requests==2.31.0
|
||||||
|
six==1.16.0
|
||||||
|
# via
|
||||||
|
# asttokens
|
||||||
|
# python-dateutil
|
||||||
|
stack-data==0.6.3
|
||||||
|
# via ipython
|
||||||
|
traitlets==5.14.1
|
||||||
|
# via
|
||||||
|
# ipython
|
||||||
|
# matplotlib-inline
|
||||||
|
tzdata==2024.1
|
||||||
|
# via pandas
|
||||||
|
urllib3==2.2.0
|
||||||
|
# via requests
|
||||||
|
wcwidth==0.2.13
|
||||||
|
# via prompt-toolkit
|
Loading…
Reference in New Issue
Block a user