From 4f67b1c97a2b55240aa4051c605294145148525d Mon Sep 17 00:00:00 2001 From: Blallo Date: Mon, 6 May 2024 01:10:43 +0200 Subject: [PATCH] Iterate over pages --- devloop/parse_retrieve.py | 19 +++++++++++-------- latecomers/main.py | 10 +++++++++- latecomers/parse.py | 27 ++++++++++++++++++++++++--- latecomers/retrieve.py | 10 +++++----- 4 files changed, 49 insertions(+), 17 deletions(-) diff --git a/devloop/parse_retrieve.py b/devloop/parse_retrieve.py index ddf30ae..fef44ee 100644 --- a/devloop/parse_retrieve.py +++ b/devloop/parse_retrieve.py @@ -2,14 +2,17 @@ import os from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24 -from latecomers.parse import find_table, get_details, parse_fr24 - -body = retrieve_from_inst() - -flights = find_table(body) - -aux_data = parse_fr24(retrieve_from_fr24()) +from latecomers.parse import count_pages, find_table, get_details, parse_fr24 breakpoint() +aux_data = parse_fr24(retrieve_from_fr24()) + +body = retrieve_from_inst() +pages = count_pages(body) +flights = find_table(body) +for page in range(2, pages + 1): + body = retrieve_from_inst(page) + flights.extend(find_table(body)) + for f in flights: - print(get_details(f, aux_data=aux_data, debug=os.environ.get("DEBUG") is not None)) + print(get_details(f, aux_data=aux_data)) diff --git a/latecomers/main.py b/latecomers/main.py index f113df8..398446a 100644 --- a/latecomers/main.py +++ b/latecomers/main.py @@ -4,7 +4,7 @@ import sys import typing as T from latecomers.retrieve import retrieve_from_inst, retrieve_from_fr24 -from latecomers.parse import find_table, get_details, Details, parse_fr24 +from latecomers.parse import count_pages, find_table, get_details, Details, parse_fr24 from latecomers.serializer import to_excel from latecomers.notifier import Notifier from latecomers.config import Config @@ -23,11 +23,19 @@ def main(config: Config): The main cli entrypoint. """ out = Notifier(**config.smtp) + body = retrieve_from_inst() + pages = count_pages(body) + logger.info(f"found all data in {pages} page(s)") table = find_table(body) + for page in range(2, pages + 1): + body = retrieve_from_inst(page) + table.extend(find_table(body)) + fr24_data = retrieve_from_fr24() aux_data = parse_fr24(fr24_data) data: T.List[Details] = [] + for row in table: data.append(get_details(row, aux_data)) diff --git a/latecomers/parse.py b/latecomers/parse.py index 15c27d3..507e001 100644 --- a/latecomers/parse.py +++ b/latecomers/parse.py @@ -28,6 +28,21 @@ def not_empty(obj: et._Element) -> bool: raise RuntimeError(f"provided argument is of unsupported type: {type(obj)}") +@logit(logger) +def count_pages(html_content: T.Text) -> int: + """ + Count how many pages there are to be accessed + """ + root = et.fromstring(html_content, parser=PARSER) + il_items = root.xpath( + "//div[contains(@data-qa-id, 'paginator')]/ul[contains(@class, 'pagination')]/li" + ) + if not il_items: + return 1 + + return len(il_items) - 2 + + @logit(logger) def find_table(html_content: T.Text) -> T.List[et._ElementTree]: """ @@ -110,7 +125,9 @@ class Details(object): This function fills the fileds related to the flight code, if present and the input matches some heuristics. """ - code = self.row.xpath(".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong") + code = self.row.xpath( + ".//td[contains(@class, 'lfr-departure-column-column')]//div/a/strong" + ) if len(code) != 1: logger.debug("Cannot parse code") return @@ -121,7 +138,9 @@ class Details(object): This function fills the field for the airport, if the input matches some heuristics. """ - airport = self.row.xpath(".//td[contains(@class, 'lfr-flight-departure-column')]/h5") + airport = self.row.xpath( + ".//td[contains(@class, 'lfr-flight-departure-column')]/h5" + ) if len(airport) != 1: logger.debug("Cannot parse airport") return @@ -132,7 +151,9 @@ class Details(object): This function fills the filed for the status, if the input matches some heuristics. """ - status = self.row.xpath(".//td[contains(@class, 'lfr-flight-status-column')]/h5") + status = self.row.xpath( + ".//td[contains(@class, 'lfr-flight-status-column')]/h5" + ) if len(status) != 1: logger.debug("Cannot parse status") return diff --git a/latecomers/retrieve.py b/latecomers/retrieve.py index 0f3f98a..111d6b3 100644 --- a/latecomers/retrieve.py +++ b/latecomers/retrieve.py @@ -14,12 +14,12 @@ HOW_MANY = 200 logger = logging.getLogger(__name__) -def remote_inst() -> T.Text: +def remote_inst(idx: int = 1) -> T.Text: """ - Returns the url to retrieve yesterday's data from institutional site. + Returns the url to retrieve yesterday's data from institutional site, provided a pagination index """ yesterday = get_date() - url = f"https://www.adr.it/pax-cia-voli-in-tempo-reale?p_p_id=3_WAR_realtimeflightsportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&_3_WAR_realtimeflightsportlet_tab=arrival&_3_WAR_realtimeflightsportlet_codScaOpe=CIA&_3_WAR_realtimeflightsportlet_rouIata=&_3_WAR_realtimeflightsportlet_searchType=standard&_3_WAR_realtimeflightsportlet_airport=&_3_WAR_realtimeflightsportlet_date={yesterday}&_3_WAR_realtimeflightsportlet_orario=00:00-24:00&_3_WAR_realtimeflightsportlet_codVet=&_3_WAR_realtimeflightsportlet_carrier=&_3_WAR_realtimeflightsportlet_rtFlightsSearchContainerPrimaryKeys=&_3_WAR_realtimeflightsportlet_delta={HOW_MANY}" # noqa: E501 + url = f"https://www.adr.it/pax-cia-voli-in-tempo-reale?p_p_id=3_WAR_realtimeflightsportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&_3_WAR_realtimeflightsportlet_tab=arrival&_3_WAR_realtimeflightsportlet_airport=&_3_WAR_realtimeflightsportlet_carrier=&_3_WAR_realtimeflightsportlet_codNat=&_3_WAR_realtimeflightsportlet_codScaOpe=CIA&_3_WAR_realtimeflightsportlet_codVet=&_3_WAR_realtimeflightsportlet_date={yesterday}&_3_WAR_realtimeflightsportlet_dataNumVol=&_3_WAR_realtimeflightsportlet_numVol=&_3_WAR_realtimeflightsportlet_rouIata=&_3_WAR_realtimeflightsportlet_orario=00:00-24:00&_3_WAR_realtimeflightsportlet_searchType=standard&_3_WAR_realtimeflightsportlet_isParent=false&_3_WAR_realtimeflightsportlet_airportId=0&_3_WAR_realtimeflightsportlet_orderByCol=comparationTime&_3_WAR_realtimeflightsportlet_orderByType=asc&_3_WAR_realtimeflightsportlet_resetCur=false&_3_WAR_realtimeflightsportlet_delta=20&_3_WAR_realtimeflightsportlet_cur={idx}" # noqa: E501 return url @@ -33,9 +33,9 @@ def remote_fr24() -> T.Text: @logit(logger) -def retrieve_from_inst() -> T.Text: +def retrieve_from_inst(idx: int = 1) -> T.Text: """This function retrieves the body from the website page""" - r = requests.get(remote_inst()) + r = requests.get(remote_inst(idx)) return r.text