This commit is contained in:
sfigato 2022-08-18 18:49:47 +02:00
commit 0986de8cb3
Signed by: blallo
GPG Key ID: 0CBE577C9B72DC3F
6 changed files with 37798 additions and 0 deletions

160
.gitignore vendored Normal file
View File

@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

12
devloop/parse.py Normal file
View File

@ -0,0 +1,12 @@
# -*- encoding: utf-8 -*-
import os
from latecomers.parse import find_table, get_details
# Development loop helper: parse ./sample.html and print the details of every
# flight row found. Setting the DEBUG environment variable (to any value)
# turns on the debug output of get_details.
with open("./sample.html") as sample:
    html = sample.read()

debug_enabled = os.environ.get("DEBUG") is not None
for row in find_table(html):
    print(get_details(row, debug_enabled))

19478
devloop/sample.html Normal file

File diff suppressed because one or more lines are too long

17945
devloop/sample2.html Normal file

File diff suppressed because one or more lines are too long

151
latecomers/parse.py Normal file
View File

@ -0,0 +1,151 @@
# -*- encoding: utf-8 -*-
from enum import Enum
import re
import typing as T
from lxml import etree as et
# Clock time such as "9:05" or "18:49": one or two digits, a colon, two digits.
TIME_RE = re.compile(r"\d\d?:\d\d")
# Matches any non-empty run of characters (\s and \S together cover everything).
# NOTE(review): not referenced in the visible part of this module - confirm it
# is still needed.
AIRPORT_RE = re.compile(r"[\w\d\s\S]+")
# Captures one of the four status labels that Status.from_str recognises.
STATUS_RE = re.compile(r"(Arrivato|In Arrivo|Schedulato|Cancellato)")
# Module-level HTML parser instance, handed to et.fromstring in find_table.
PARSER = et.HTMLParser()
def not_empty(obj: et._Element) -> bool:
    """
    Tell whether a table row carries flight data.

    A row counts as non-empty when it contains exactly 5 or 6 ``<h5>``
    descendants, which are the two layouts ``get_details`` knows how to read.

    :param obj: the element to inspect (a ``<tr>`` when called by
        ``find_table``).
    :returns: ``True`` if ``obj`` has 5 or 6 ``<h5>`` descendants.
    :raises RuntimeError: if ``obj`` is not an lxml element.
    """
    # isinstance, rather than an exact type comparison, also accepts element
    # subclasses (e.g. those produced by a custom element class lookup).
    if not isinstance(obj, et._Element):
        raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")
    return len(obj.xpath(".//h5")) in (5, 6)
def find_table(html_content: T.Text) -> T.List[et._ElementTree]:
    """
    Extract the flight rows from the html response.

    The rows are the ``<tr>`` descendants of the single ``<tbody>`` whose
    class contains ``table-data``; only rows accepted by ``not_empty`` are
    kept. An empty list is returned when no such ``<tbody>`` exists, and a
    ``ValueError`` is raised when more than one is found.
    """
    document = et.fromstring(html_content, parser=PARSER)
    bodies = document.xpath("//tbody[contains(@class, 'table-data')]")
    if len(bodies) > 1:
        raise ValueError(f"Unexpected parsing result: found {len(bodies)} results")
    if len(bodies) == 0:
        return []
    (table_body,) = bodies
    return list(filter(not_empty, table_body.xpath(".//tr")))
class Status(Enum):
    """
    Status of a flight.

    Each member's value is the label that ``from_str`` looks for in the
    parsed text; ``UNKNOWN`` is the fallback when no label is found.
    """

    ARRIVED = "Arrivato"
    ARRIVING = "In Arrivo"
    SCHEDULED = "Schedulato"
    CANCELED = "Cancellato"
    UNKNOWN = "Sconosciuto"

    @classmethod
    def from_str(cls, text: T.Text) -> "Status":
        """
        Map a piece of text to the first status whose label it contains.

        The match is a case-sensitive substring search, tried in the order
        ARRIVED, ARRIVING, SCHEDULED, CANCELED. ``UNKNOWN`` is returned when
        none of those labels occurs in ``text``.
        """
        lookup_order = (cls.ARRIVED, cls.ARRIVING, cls.SCHEDULED, cls.CANCELED)
        for candidate in lookup_order:
            if candidate.value in text:
                return candidate
        return cls.UNKNOWN
class Details(object):
    """
    Data extracted from one row of the flights table.

    All fields start unset (``status`` starts as ``Status.UNKNOWN``) and are
    filled in by the ``maybe_parse_*`` methods. Each of those methods leaves
    the instance untouched when the given ``<h5>`` element does not match its
    heuristics.
    """

    # Time found in an <h5> styled with "text-decoration: line-through".
    th_arrival: T.Optional[T.Text] = None
    # Time found in an <h5> without the line-through style.
    real_arrival: T.Optional[T.Text] = None
    # Flight code, taken from an <h5> whose class contains "flight-numb".
    code: T.Optional[T.Text] = None
    # Airport name, taken from an <h5> whose class contains "blue-title".
    origin: T.Optional[T.Text] = None
    # Flight status; stays UNKNOWN until maybe_parse_status recognises one.
    status: Status = Status.UNKNOWN

    @staticmethod
    def _text_of(h5: et._Element) -> T.Text:
        """
        Return the text of ``h5``, or an empty string when it has none.

        lxml reports a missing text as ``None``, which would make the regex
        and ``str`` operations in the ``maybe_parse_*`` methods raise.
        """
        return h5.text or ""

    def maybe_parse_hour(self, h5: et._Element) -> None:
        """
        This function fills the fields related to the arrival hour,
        if the input matches some heuristics.

        The element must contain exactly one time. It is stored in
        ``th_arrival`` when the element is struck through, and in
        ``real_arrival`` otherwise.
        """
        hours = TIME_RE.findall(self._text_of(h5))
        if len(hours) != 1:
            return
        if "text-decoration: line-through" in h5.attrib.get("style", ""):
            self.th_arrival = hours[0]
        else:
            self.real_arrival = hours[0]

    def maybe_parse_code(self, h5: et._Element) -> None:
        """
        This function fills the field related to the flight code,
        if present and the input matches some heuristics.

        The element must have non-blank text and a class containing
        ``flight-numb``.
        """
        code = self._text_of(h5).strip("\t\n ")
        if code and "flight-numb" in h5.attrib.get("class", ""):
            self.code = code

    def maybe_parse_airport(self, h5: et._Element) -> None:
        """
        This function fills the field for the airport, if the input matches some
        heuristics.

        The element must have non-empty text and a class containing
        ``blue-title``.
        """
        # NOTE(review): unlike maybe_parse_code, spaces are not stripped here;
        # kept as-is to preserve the stored value - confirm this is intended.
        airport = self._text_of(h5).strip("\t\n")
        if airport and "blue-title" in h5.attrib.get("class", ""):
            self.origin = airport

    def maybe_parse_status(self, h5: et._Element) -> None:
        """
        This function fills the field for the status, if the input matches some
        heuristics.

        When the class of the element contains ``arrivato`` or ``schedulato``
        the whole text is handed to ``Status.from_str``; otherwise the status
        is set only if exactly one known label is found in the text.
        """
        text = self._text_of(h5)
        _class = h5.attrib.get("class", "")
        if "arrivato" in _class or "schedulato" in _class:
            self.status = Status.from_str(text)
            return
        parsed = STATUS_RE.findall(text)
        if len(parsed) == 1:
            self.status = Status.from_str(parsed[0])

    def __str__(self) -> T.Text:
        """
        Render the instance as ``Detail<key=value,...>``.

        ``theoric`` and ``code`` are listed only when set; ``real``,
        ``origin`` and ``status`` are always listed.
        """
        res: T.Dict[T.Text, T.Optional[T.Text]] = {}
        if self.th_arrival:
            res["theoric"] = self.th_arrival
        res["real"] = self.real_arrival
        if self.code:
            res["code"] = self.code
        res["origin"] = self.origin
        res["status"] = self.status.value
        desc = ",".join(f"{k}={v}" for k, v in res.items())
        return f"Detail<{desc}>"
def get_details(table_entry: et._ElementTree, debug: bool = False) -> Details:
    """
    Build a ``Details`` object from one row of the flights table.

    The ``<h5>`` descendants of the row are read by position:

    * 5 elements: hour, code, airport, status (the last one is ignored);
    * 6 elements: hour, hour, code, airport, status (the last one is ignored).

    With any other count up to 6, a ``Details`` with all defaults is returned.

    :param table_entry: the row to read.
    :param debug: when true, print the text and attributes of every ``<h5>``
        found before parsing.
    :returns: the ``Details`` filled with whatever could be recognised.
    :raises ValueError: if the row holds more than 6 ``<h5>`` elements.
    """
    res = table_entry.xpath(".//h5")
    if len(res) > 6:
        raise ValueError(f"Unexpected number of h5 found in line: {len(res)}")
    if debug:
        for r in res:
            # lxml gives None for an element without text; print it as empty
            # instead of failing on None.strip().
            txt = (r.text or "").strip("\t\n ")
            print(f"[DEBUG] text={txt} attrs={r.attrib}")
    d = Details()
    if len(res) in (5, 6):
        # Everything before the last four cells is an hour cell: one in the
        # 5-cell layout, two in the 6-cell layout.
        *hours, code, airport, status, _ = res
        for hour in hours:
            d.maybe_parse_hour(hour)
        d.maybe_parse_code(code)
        d.maybe_parse_airport(airport)
        d.maybe_parse_status(status)
    return d

52
pyproject.toml Normal file
View File

@ -0,0 +1,52 @@
[project]
name = "latecomers"
version = "0.1.0"
description = "Retrieve and save data from ADR Ciampino airport"
authors = [{name="Leonardo Barcaroli", email="blallo@autistici.org"}]
license = {text="Public Domain"}
requires-python = ">= 3.6"
dependencies = [
"requests",
"lxml",
"pandas",
]
[tool.setuptools.packages.find]
include = ["latecomers*"]
[project.optional-dependencies]
dev = [
"ipython",
"black",
]
[build-system]
requires = [
"setuptools",
]
build-backend = "setuptools.build_meta"
[tool.black]
line-length = 88
target_version = ['py36']
include = '\.pyi?$'
exclude = '''
(
/(
\.eggs # exclude a few common directories in the
| \.git # root of the project
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| _build
| buck-out
| build
| dist
)
)
'''