Bump to v0.5.0

Iterate over pages
Remove empty template line
2024-05-06 01:11:10 +02:00 · 2024-05-06 01:10:43 +02:00 · 2024-05-06 01:10:23 +02:00 · 2024-05-05 23:29:12 +02:00 · 2024-02-17 16:34:08 +01:00 · 2024-02-17 16:33:52 +01:00
10 changed files with 19995 additions and 80 deletions
--- a/devloop/parse.py
+++ b/devloop/parse.py
@ -1,12 +1,10 @@
 # -*- encoding: utf-8 -*-
-import os
-
 from latecomers.parse import find_table, get_details

-with open("./devloop/sample.html") as f:
+with open("./devloop/sample3.html") as f:
    content = f.read()

 flights = find_table(content)

 for f in flights:
-    print(get_details(f, os.environ.get("DEBUG") is not None))
+    print(get_details(f))
--- a/devloop/parse_retrieve.py
+++ b/devloop/parse_retrieve.py
@ -2,14 +2,17 @@
 import os

 from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
-from latecomers.parse import find_table, get_details, parse_fr24
-
-body = retrieve_from_inst()
-
-flights = find_table(body)
-
-aux_data = parse_fr24(retrieve_from_fr24())
+from latecomers.parse import count_pages, find_table, get_details, parse_fr24

 breakpoint()
+aux_data = parse_fr24(retrieve_from_fr24())
+
+body = retrieve_from_inst()
+pages = count_pages(body)
+flights = find_table(body)
+for page in range(2, pages + 1):
+    body = retrieve_from_inst(page)
+    flights.extend(find_table(body))
+
 for f in flights:
-    print(get_details(f, aux_data=aux_data, debug=os.environ.get("DEBUG") is not None))
+    print(get_details(f, aux_data=aux_data))
--- a/devloop/sample3.html
+++ b/devloop/sample3.html
--- a/devloop/serialize.py
+++ b/devloop/serialize.py
@ -1,22 +1,25 @@
 # -*- encoding: utf-8 -*-
-import os
-
 import pandas as pd

 from latecomers.parse import find_table, get_details
 from latecomers.serializer import to_excel

-with open("./devloop/sample.html") as f:
+with open("./devloop/sample3.html") as f:
    content = f.read()

 flights = find_table(content)
 data = []

 for f in flights:
-    data.append(get_details(f, os.environ.get("DEBUG") is not None))
+    data.append(get_details(f))

 print(to_excel(data))

-colonne = {"th_arrival": "Arrivo teorico", "real_arrival": "Arrivo reale",
-           "code": "Codice volo", "origin": "Aeroporto di partenza", "status": "Stato"}
+colonne = {
+    "th_arrival": "Arrivo teorico",
+    "real_arrival": "Arrivo reale",
+    "code": "Codice volo",
+    "origin": "Aeroporto di partenza",
+    "status": "Stato",
+}
 df = pd.DataFrame(data, columns=colonne)
--- a/latecomers/main.py
+++ b/latecomers/main.py
@ -4,7 +4,7 @@ import sys
 import typing as T

 from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24
-from latecomers.parse import find_table, get_details, Details, parse_fr24
+from latecomers.parse import count_pages, find_table, get_details, Details, parse_fr24
 from latecomers.serializer import to_excel
 from latecomers.notifier import Notifier
 from latecomers.config import Config
@ -23,11 +23,19 @@ def main(config: Config):
    The main cli entrypoint.
    """
    out = Notifier(**config.smtp)
+
    body = retrieve_from_inst()
+    pages = count_pages(body)
+    logger.info(f"found all data in {pages} page(s)")
    table = find_table(body)
+    for page in range(2, pages + 1):
+        body = retrieve_from_inst(page)
+        table.extend(find_table(body))
+
    fr24_data = retrieve_from_fr24()
    aux_data = parse_fr24(fr24_data)
    data: T.List[Details] = []
+
    for row in table:
        data.append(get_details(row, aux_data))

--- a/latecomers/notifier.py
+++ b/latecomers/notifier.py
@ -84,7 +84,10 @@ class Notifier(object):

        email = message.as_string()

-        self.send(to, email)
+        rcpt = to
+        rcpt.extend(cc)
+
+        self.send(rcpt, email)

    @retry_and_log(logger, RETRIES)
    def send_no_data(self, to: T.List[T.Text], cc: T.List[T.Text]) -> None:
@ -107,4 +110,7 @@ il vostro tecnico preferito.
        message.attach(MIMEText(body, "plain"))
        email = message.as_string()

-        self.send(to, email)
+        rcpt = to
+        rcpt.extend(cc)
+
+        self.send(rcpt, email)
--- a/latecomers/parse.py
+++ b/latecomers/parse.py
@ -20,26 +20,47 @@ logger = logging.getLogger(__name__)

 def not_empty(obj: et._Element) -> bool:
    if type(obj) is et._Element:
-        children = len(obj.xpath(".//h5"))
-        return children == 5 or children == 6
+        if "lfr-template" in obj.attrib.get("class"):
+            return False
+        children = len(obj.xpath(".//td"))
+        return children in (5, 6)

    raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}")


+@logit(logger)
+def count_pages(html_content: T.Text) -> int:
+    """
+    Count how many pages there are to be accessed
+    """
+    root = et.fromstring(html_content, parser=PARSER)
+    il_items = root.xpath(
+        "//div[contains(@data-qa-id, 'paginator')]/ul[contains(@class, 'pagination')]/li"
+    )
+    if not il_items:
+        return 1
+
+    return len(il_items) - 2
+
+
@logit(logger)
 def find_table(html_content: T.Text) -> T.List[et._ElementTree]:
    """
    Find the table that holds the data in the html response
    """
    root = et.fromstring(html_content, parser=PARSER)
-    tbody = root.xpath("//tbody[contains(@class, 'table-data')]")
+    tbody = root.xpath("//tbody")
    if not tbody:
        return []

    if len(tbody) != 1:
        raise ValueError(f"Unexpected parsing result: found {len(tbody)} results")

-    result = [child for child in tbody[0].xpath(".//tr") if not_empty(child)]
+    result = [
+        child
+        for child in tbody[0].xpath(".//tr[contains(@data-qa-id, 'row')]")
+        if not_empty(child)
+    ]

    return result

@ -74,56 +95,73 @@ class Details(object):
    status: Status = Status.UNKNOWN
    fr24_landing_time: T.Optional[T.Text] = None

-    def maybe_parse_hour_th(self, h5: et._ElementTree) -> None:
+    def __init__(self, row: et._ElementTree) -> None:
+        self.row = row
+
+    def maybe_parse_hour_th(self) -> None:
        """
        This function fills the fields related to the theoretical arrival hour,
        if the input matches some heuristics.
        """
-        hour = TIME_RE.findall(h5.text)
-        if len(hour) == 1:
-            self.th_arrival = hour[0]
-            if "text-decoration: line-through" in h5.attrib.get("style", ""):
-                self.real_arrival = None
+        hour = self.row.xpath(".//span[contains(@class, 'date-estimated__time')]")
+        if len(hour) != 1:
+            logger.debug("Cannot parse estimated time")
+            return
+        self.th_arrival = hour[0].text

-    def maybe_parse_hour_real(self, h5: et._ElementTree) -> None:
+    def maybe_parse_hour_real(self) -> None:
        """
        This function fills the fields related to the actual arrival hour,
        if the input matches some heuristics.
        """
-        hour = TIME_RE.findall(h5.text)
-        if len(hour) == 1:
-            self.real_arrival = hour[0]
+        hour = self.row.xpath(".//span[contains(@class, 'date-actual__time')]")
+        if len(hour) != 1:
+            logger.debug("Cannot parse actual time")
+            return
+        self.real_arrival = hour[0].text

-    def maybe_parse_code(self, h5: et._ElementTree) -> None:
+    def maybe_parse_code(self) -> None:
        """
        This function fills the fields related to the flight code,
        if present and the input matches some heuristics.
        """
-        if "flight-numb" not in h5.attrib.get("class", ""):
+        code = self.row.xpath(
+            ".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong"
+        )
+        if len(code) != 1:
+            logger.debug("Cannot parse code")
            return
-        child = h5.xpath(".//strong")
-        if len(child) == 1:
-            self.code = child[0].text.strip("\t\n ").replace(" ", "")
+        self.code = code[0].text.strip("\t\n ").replace(" ", "")

-    def maybe_parse_airport(self, h5: et._ElementTree) -> None:
+    def maybe_parse_airport(self) -> None:
        """
        This function fills the field for the airport, if the input matches some
        heuristics.
        """
-        airport = h5.text.strip("\t\n")
-        if len(airport) > 0 and "blue-title" in h5.attrib.get("class", ""):
-            self.origin = airport
+        airport = self.row.xpath(
+            ".//td[contains(@class, 'lfr-flight-departure-column')]/h5"
+        )
+        if len(airport) != 1:
+            logger.debug("Cannot parse airport")
+            return
+        self.origin = airport[0].text.strip("\t\n")

-    def maybe_parse_status(self, h5: et._ElementTree) -> None:
+    def maybe_parse_status(self) -> None:
        """
        This function fills the field for the status, if the input matches some
        heuristics.
        """
-        _class = h5.attrib.get("class", "")
+        status = self.row.xpath(
+            ".//td[contains(@class, 'lfr-flight-status-column')]/h5"
+        )
+        if len(status) != 1:
+            logger.debug("Cannot parse status")
+            return
+        _class = status[0].attrib.get("class", "")
        if "arrivato" in _class or "schedulato" in _class:
-            self.status = Status.from_str(h5.text)
+            self.status = Status.from_str(status[0].text)
        else:
-            parsed = STATUS_RE.findall(h5.text)
+            parsed = STATUS_RE.findall(status[0].text)
            if len(parsed) == 1:
                self.status = Status.from_str(parsed[0])

@ -133,6 +171,7 @@ class Details(object):
        only FlightRadar24 data).
        """
        if not self.code:
+            logger.debug("Cannot add aux data: missing code")
            return

        self.fr24_landing_time = aux_data.get(self.code)
@ -156,34 +195,18 @@ class Details(object):
 def get_details(
    table_entry: et._ElementTree,
    aux_data: T.Optional[T.Dict[T.Text, T.Text]] = None,
-    debug: bool = False,
 ) -> Details:
    """
    Find the dates in a table row. If a struck-through time is found, it is
    returned as second element in the tuple.
    """
-    res = table_entry.xpath(".//h5")
-    if len(res) > 6:
-        raise ValueError(f"Unexpected number of h5 found in line: {len(res)}")
+    d = Details(table_entry)

-    if debug:
-        for r in res:
-            txt = r.text.strip("\t\n ")
-            print(f"[DEBUG] text={txt} attrs={r.attrib}")
-
-    d = Details()
-
-    if len(res) == 5:
-        d.maybe_parse_hour_th(res[0])
-        d.maybe_parse_code(res[1])
-        d.maybe_parse_airport(res[2])
-        d.maybe_parse_status(res[3])
-    elif len(res) == 6:
-        d.maybe_parse_hour_th(res[0])
-        d.maybe_parse_hour_real(res[1])
-        d.maybe_parse_code(res[2])
-        d.maybe_parse_airport(res[3])
-        d.maybe_parse_status(res[4])
+    d.maybe_parse_hour_th()
+    d.maybe_parse_hour_real()
+    d.maybe_parse_code()
+    d.maybe_parse_airport()
+    d.maybe_parse_status()

    if aux_data:
        d.maybe_add_aux_data(aux_data)
@ -192,7 +215,7 @@ def get_details(


 def parse_fr24(
-    data: T.Optional[T.Dict[T.Text, T.Any]]
+    data: T.Optional[T.Dict[T.Text, T.Any]],
 ) -> T.Optional[T.Dict[T.Text, T.Text]]:
    """
    This function parses the given FlightRadar24 data into a pandas DataFrame.
@ -206,9 +229,7 @@ def parse_fr24(
        results = {}
        for flight in data["result"]["response"]["airport"]["pluginData"]["schedule"][
            "arrivals"
-        ][
-            "data"
-        ]:  # noqa: E501
+        ]["data"]:  # noqa: E501
            try:
                id_num = flight["flight"]["identification"]["number"]
                if _code := id_num.get("default"):
--- a/latecomers/retrieve.py
+++ b/latecomers/retrieve.py
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "latecomers"
-version = "0.3.2"
+version = "0.5.0"
 description = "Retrieve and save data from ADR Ciampino airport"
 authors = [{name="Leonardo Barcaroli", email="blallo@autistici.org"}]
 license = {text="Public Domain"}
@ -24,6 +24,7 @@ dev = [
    "ipython",
    "black",
    "build",
+    "ipdb",
 ]

 [build-system]
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@ -0,0 +1,80 @@
+# This file was autogenerated by uv v0.1.3 via the following command:
+#    uv pip compile --all-extras pyproject.toml -o requirements.dev.txt
+asttokens==2.4.1
+    # via stack-data
+black==24.2.0
+build==1.0.3
+certifi==2024.2.2
+    # via requests
+charset-normalizer==3.3.2
+    # via requests
+click==8.1.7
+    # via black
+decorator==5.1.1
+    # via
+    #   ipdb
+    #   ipython
+et-xmlfile==1.1.0
+    # via openpyxl
+executing==2.0.1
+    # via stack-data
+idna==3.6
+    # via requests
+ipdb==0.13.13
+ipython==8.21.0
+    # via ipdb
+jedi==0.19.1
+    # via ipython
+lxml==5.1.0
+matplotlib-inline==0.1.6
+    # via ipython
+mypy-extensions==1.0.0
+    # via black
+numpy==1.26.4
+    # via pandas
+openpyxl==3.1.2
+packaging==23.2
+    # via
+    #   black
+    #   build
+pandas==2.2.0
+parso==0.8.3
+    # via jedi
+pathspec==0.12.1
+    # via black
+pexpect==4.9.0
+    # via ipython
+platformdirs==4.2.0
+    # via black
+prompt-toolkit==3.0.43
+    # via ipython
+ptyprocess==0.7.0
+    # via pexpect
+pure-eval==0.2.2
+    # via stack-data
+pygments==2.17.2
+    # via ipython
+pyproject-hooks==1.0.0
+    # via build
+python-dateutil==2.8.2
+    # via pandas
+pytz==2024.1
+    # via pandas
+pyyaml==6.0.1
+requests==2.31.0
+six==1.16.0
+    # via
+    #   asttokens
+    #   python-dateutil
+stack-data==0.6.3
+    # via ipython
+traitlets==5.14.1
+    # via
+    #   ipython
+    #   matplotlib-inline
+tzdata==2024.1
+    # via pandas
+urllib3==2.2.0
+    # via requests
+wcwidth==0.2.13
+    # via prompt-toolkit
Author	SHA1	Message	Date
Blallo	b9d89c3f0c	Bump to v0.5.0	2024-05-06 01:11:10 +02:00
Blallo	4f67b1c97a	Iterate over pages	2024-05-06 01:10:43 +02:00
Blallo	6e56f90b6b	Remove empty template line	2024-05-06 01:10:23 +02:00
Blallo	5405efbbc4	Bump to v0.4.0	2024-05-05 23:29:12 +02:00
Blallo	789091f7a8	Use uv machinery and add ipdb	2024-02-17 16:34:08 +01:00
Blallo	a8b321a47d	Improve devloop	2024-02-17 16:33:52 +01:00
Blallo	31f2636dd8	Fix parsing	2024-02-17 16:33:34 +01:00
Blallo	fa0023d2d1	Bump to v0.3.3	2022-09-27 23:16:11 +02:00
Blallo	4d1596c4a2	Make cc REALLY work	2022-09-27 23:15:50 +02:00