openlamb/openlamb.py

#!/usr/bin/python3

import argparse
import traceback
import sys
import pandas as pd
import numpy as np
from sodapy import Socrata
import matplotlib.pyplot as plt
import re
import glob
import os
from os import getcwd, chdir

# Directory where the zipped CSV archives are looked up and where missing
# ones are downloaded to.
path_to_csv_files = "csv/"

# Year (as a string) -> dataset identifier handed to the Socrata client when
# reading data online.
datasets_ambiente = {"2020": "nicp-bhqi",
                     "2019": "kujm-kavy",
                     "2018": "bgqm-yq56",
                     "2017": "j8j8-qsb2"}

# Zip archive file name -> identifier used to build its download URL.
# The year, or the first-last year range, covered by each archive is encoded
# in the file name and parsed from it by get_csv_dict().
csv_ambiente = {"sensori_aria_1968-1995.zip": "puwt-3xxh",
                "sensori_aria_1996-2000.zip": "wabv-jucw",
                "sensori_aria_2001-2004.zip": "5jdj-7x8y",
                "sensori_aria_2005-2007.zip": "h3i4-wm93",
                "sensori_aria_2008-2010.zip": "wp2f-5nw6",
                "sensori_aria_2011.zip": "5mut-i45n",
                "sensori_aria_2012.zip": "wr4y-c9ti",
                "sensori_aria_2013.zip": "hsdm-3yhd",
                "sensori_aria_2014.zip": "69yc-isbh",
                "sensori_aria_2015.zip": "bpin-c7k8",
                "sensori_aria_2016.zip": "7v3n-37f3",
                "sensori_aria_2017.zip": "fdv6-2rbs",
                "sensori_aria_2018.zip": "4t9j-fd8z",
                "sensori_aria_2019.zip": "j2mz-aium"}


def _connect():
    """Create a Socrata client bound to the www.dati.lombardia.it portal.

    The second constructor argument is passed as None, i.e. the client is
    created without any value for it.
    """
    return Socrata("www.dati.lombardia.it", None)


def read_data_online(dataset, sensore):
    """Fetch the records of one sensor from an online dataset.

    Args:
        dataset: dataset identifier passed to the Socrata client.
        sensore: sensor id used as the ``IdSensore`` filter.

    Returns:
        Whatever ``client.get`` returns for the filtered query.
    """
    client = _connect()
    try:
        # NOTE(review): no ``limit`` is passed, so the service's default row
        # cap applies - confirm whether long periods are being truncated.
        return client.get(dataset, IdSensore=sensore)
    finally:
        # A new client is built on every call; close it so its HTTP session
        # is not leaked once the request is done (or has failed).
        client.close()


def read_data_from_csv(datafile):
    """Load one archive from the csv directory into a dataframe.

    Args:
        datafile: file name (not a path) of the archive to read.

    Returns:
        A pandas DataFrame restricted to the columns ``IdSensore``, ``Data``,
        ``Valore``, ``Stato`` and ``idOperatore``.
    """
    # Read from the same configurable directory the archives are looked up in
    # and downloaded to, instead of a second hard-coded copy of its name.
    return pd.read_csv(os.path.join(path_to_csv_files, datafile),
                       usecols=['IdSensore', 'Data', 'Valore', 'Stato', 'idOperatore'])


def process(dati, sensore, csv):
    """Load the readings of one sensor from a dataset or a csv file.

    Args:
        dati: dataset identifier (online) or archive file name (csv).
        sensore: sensor id; it also becomes the name of the value column.
        csv: truthy to read ``dati`` from the csv directory, falsy to query
            it online.

    Returns:
        A DataFrame sorted by ``data`` holding the ``data`` column and the
        sensor values in a column named after ``sensore``; -9999 values are
        replaced by NaN.

    Raises:
        SystemExit: with status -1 when the data cannot be converted or
            filtered for the requested sensor.
    """
    print('Sto processando i dati del sensore %s per l\'origine dati %s...' % (sensore, dati))
    if csv:
        # The csv reader already returns a DataFrame; passing it through
        # from_records is unnecessary (and deprecated in recent pandas).
        results_df = read_data_from_csv(dati)
    else:
        results_df = pd.DataFrame.from_records(read_data_online(dati, sensore))
    # Column names differ in case between the two sources; normalise them.
    results_df.columns = [x.lower() for x in results_df.columns]
    try:
        results_df = results_df.astype({'idsensore': 'int64'})
        results_df = results_df[results_df['idsensore'] == int(sensore)]
        results_df = results_df.astype({'valore': 'float64'})
        results_df["data"] = pd.to_datetime(results_df["data"])
        results_df = results_df.replace(-9999, np.nan)
    except Exception:
        # The sensor id must be formatted into the message *before* printing:
        # applying % to the result of print() raised a TypeError here and hid
        # the real error.
        print('\nERRORE: dati non disponibili per il sensore %s\n' % sensore)
        traceback.print_exc()
        sys.exit(-1)
    results_df.sort_values(by=['data'], inplace=True)
    results_df.rename(columns={'valore': sensore}, inplace=True)
    results_df.drop(columns=['idoperatore', 'idsensore', 'stato'],
                    inplace=True)
    return results_df


def merge_df(dataframes, sensori):
    """Combine several dataframes into a single one, one sensor per column.

    Args:
        dataframes: mapping from key to DataFrame.
        sensori: list of keys of ``dataframes`` to merge, in order.

    Returns:
        The result of merging the selected frames pairwise with ``pd.merge``
        (default arguments).

    Raises:
        SystemExit: with status -1 when the merged frame has no rows.
    """
    merged = dataframes[sensori[0]]
    for key in sensori[1:]:
        merged = pd.merge(merged, dataframes[key])
    if merged.shape[0] > 0:
        return merged
    print('\nERRORE: dati non disponibili per il sensore nel periodo considerato\n')
    sys.exit(-1)


def _load_sources(sources, sensore, from_csv):
    """Process every source for one sensor and stack the results row-wise."""
    frame = process(sources[0], sensore, from_csv)
    for source in sources[1:]:
        frame = pd.concat([frame, process(source, sensore, from_csv)], axis=0, ignore_index=True)
    return frame


def get_dataframes(dati_csv, dati, sensori):
    """Build a dict with the dataframes of every requested sensor.

    Args:
        dati_csv: list of csv archive names (may be empty).
        dati: list of online dataset identifiers (may be empty).
        sensori: iterable of sensor ids (strings).

    Returns:
        A dict where csv data is stored under ``"<sensor>-csv"`` (with the
        value column renamed the same way) and online data under the plain
        sensor id.
    """
    collected = {}
    for sensore in sensori:
        if dati_csv:
            csv_key = sensore + "-csv"
            csv_frame = _load_sources(dati_csv, sensore, True)
            csv_frame.rename(columns={sensore: csv_key}, inplace=True)
            collected[csv_key] = csv_frame
        if dati:
            collected[sensore] = _load_sources(dati, sensore, False)
    return collected


def plot_dataframe(dataframe):
    """Plot every column of ``dataframe`` against its ``data`` column.

    A black horizontal reference line labelled 'EU limit' is drawn at y=50
    and the figure is then shown.
    """
    dataframe.plot(x='data')
    plt.axhline(y=50, color='black', linestyle='-', label='EU limit')
    # The legend was created by dataframe.plot() before the reference line
    # existed; rebuild it so the 'EU limit' label is actually displayed.
    plt.legend()
    plt.show()


def list_of_csv_files(dir_name):
    """Return the names of the ``*.zip`` files found directly in ``dir_name``.

    The directory is inspected without changing the process working
    directory, so a failure can never leave the program in another folder.

    Args:
        dir_name: directory to inspect.

    Returns:
        A list of file names (without the directory part), in the order
        reported by ``glob``.

    Raises:
        FileNotFoundError: if ``dir_name`` does not exist.
        NotADirectoryError: if ``dir_name`` exists but is not a directory.
    """
    if not os.path.isdir(dir_name):
        # Keep failing loudly for a missing/invalid directory, as entering it
        # with chdir() used to.
        error = NotADirectoryError if os.path.exists(dir_name) else FileNotFoundError
        raise error(dir_name)
    # Escape the directory part so glob metacharacters in it are taken literally.
    pattern = os.path.join(glob.escape(dir_name), '*.zip')
    return [os.path.basename(found) for found in glob.glob(pattern)]


def parse_range(x):
    """Expand a number or an inclusive ``first-last`` range.

    Args:
        x: string such as ``"2017"`` or ``"2017-2019"``; surrounding
            whitespace (also around the dash) is ignored.

    Yields:
        For a single number, that number as a string; for a range, every
        integer from ``first`` to ``last`` inclusive. (The differing types
        are kept for backward compatibility; callers convert with ``str``.)

    Raises:
        ValueError: if ``x`` is neither a number nor a two-sided numeric
            range. Raised when iteration starts, since this is a generator.
    """
    x = x.strip()
    if x.isdigit():
        yield str(x)
        return
    bounds = [part.strip() for part in x.split('-')]
    # Exactly two numeric bounds are required: anything else ("2017-",
    # "-5", "2017-2018-2019", "abc") used to be truncated silently or to fail
    # with an unrelated int() message.
    if len(bounds) != 2 or not all(part.isdigit() for part in bounds):
        raise ValueError(f"Unknown range specified: {x}")
    first, last = int(bounds[0]), int(bounds[1])
    yield from range(first, last + 1)


def get_csv_dict(dict):
    """Index the csv archives by the year(s) encoded in their file name.

    Args:
        dict: mapping from archive file name to its identifier. (The
            parameter name shadows the builtin; it is kept so existing
            callers are unaffected.)

    Returns:
        A dict mapping each year (as a string) to ``[filename, identifier]``.
        A name containing ``NNNN-NNNN`` contributes every year of that
        inclusive range, a name containing a single ``NNNN`` contributes that
        year. Names without a year are reported and skipped.
    """
    by_year = {}
    for filename, dataset_id in dict.items():
        match_multi = re.search(r"(\d{4})-(\d{4})", filename)
        match_single = re.search(r"\d{4}", filename)
        if match_multi:
            first, last = int(match_multi.group(1)), int(match_multi.group(2))
            years = [str(year) for year in range(first, last + 1)]
        elif match_single:
            years = [match_single.group()]
        else:
            print("no match")
            # Skip the entry: falling through would reuse the years of the
            # previous file (or fail on an unbound name for the first one).
            continue
        for year in years:
            by_year[year] = [filename, dataset_id]
    return by_year


def check_csv(args, filelist, csv_dict):
    """Resolve a year or year range to the csv archives that contain it.

    Archives that are not listed in ``filelist`` are downloaded into the csv
    directory.

    Args:
        args: year or ``first-last`` range string.
        filelist: names of the archives already present locally.
        csv_dict: mapping year -> ``[filename, identifier]``.

    Returns:
        The list of distinct archive file names covering the requested
        years, in order of first appearance.

    Raises:
        SystemExit: with status -1 when a requested year has no archive.
    """
    years = [str(x) for x in parse_range(args)]
    needed = []
    for y in years:
        if y not in csv_dict:
            print("Errore: i dati per l'anno %s non sono disponibili come csv" % y)
            sys.exit(-1)
        filename, dataset_id = csv_dict[y][0], csv_dict[y][1]
        if filename in needed:
            # Already handled for an earlier year of the same archive: do not
            # download a multi-year file once per year it covers.
            continue
        if filename not in filelist:
            print("file %s for year %s is not available in folder %s" % (filename, y, path_to_csv_files))
            download_csv(filename, dataset_id, path_to_csv_files)
        needed.append(filename)
    return needed


def download_csv(filename, id, path):
    """Download one zip archive and store it as ``path/filename``.

    Args:
        filename: name the archive is saved under.
        id: identifier inserted in the download URL. (Shadows the builtin;
            the name is kept so existing callers are unaffected.)
        path: destination directory.

    Raises:
        SystemExit: with status -1 when the request fails for any reason
            reported by ``requests`` (connection, timeout, HTTP status, ...).
    """
    print("downloading %s....... please wait" % filename)
    import requests
    url = "https://www.dati.lombardia.it/download/" + id + "/application%2Fzip"
    try:
        # The request itself must be inside the try block: connection errors
        # and timeouts are raised by get(), not by raise_for_status().
        # Without a timeout a stalled server would hang the program forever.
        req = requests.get(url, allow_redirects=True, timeout=60)
        req.raise_for_status()
    except requests.RequestException as e:
        # RequestException is the base class of every requests error.
        print("Download error: \n\t %s" % str(e))
        sys.exit(-1)
    # os.path.join works with or without a trailing separator in ``path``
    # (dirname("csv") is "", which used to target the filesystem root).
    with open(os.path.join(path, filename), "wb") as f:
        f.write(req.content)


def check_year_range(arg):
    """Check that ``arg`` is a year (NNNN) or a year range (NNNN-NNNN).

    Whitespace around the years and around the dash is tolerated.

    Args:
        arg: the string to validate.

    Returns:
        True when the syntax is valid.

    Raises:
        SystemExit: with status -1 (after printing a usage hint) otherwise.
    """
    # Match the whole argument: an unanchored search accepted any string that
    # merely contained four digits, e.g. "abcd2017" or "2017-".
    if re.fullmatch(r"\s*\d{4}\s*(-\s*\d{4}\s*)?", arg) is None:
        print("\nError: syntax for --csv and --dataset parameter: "
              "NNNN single year or NNNN-NNNN for years range\n")
        sys.exit(-1)
    return True


def main():
    """Command-line entry point.

    Parses ``--dataset``, ``--csv`` and ``--sensori``, loads the requested
    sensors from the online datasets and/or the csv archives, merges them,
    writes the result to ``export.csv``, prints the mean value of every
    sensor column together with its station name and finally plots the data.

    Exits with a non-zero status when the run fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", nargs='+', required=False,
                        help="ricerca dei datasets")
    parser.add_argument("--csv", nargs='+', required=False,
                        help="ricerca nei files csv")
    parser.add_argument('--sensori', nargs='+', required=True,
                        help="cerca i dati di questi sensori")
    args = parser.parse_args()

    # With no data source there is nothing to merge; report it as a usage
    # error instead of crashing later on an empty dataframe dict.
    if not args.csv and not args.dataset:
        parser.error("at least one of --dataset or --csv is required")

    try:
        # NOTE: only the first value given to --csv / --dataset is used.
        dati_csv = []
        if args.csv:
            check_year_range(args.csv[0])
            # The csv directory is only needed (and only has to exist) when
            # csv data was actually requested.
            csv_dict = get_csv_dict(csv_ambiente)
            csv_files = list_of_csv_files(path_to_csv_files)
            dati_csv = check_csv(args.csv[0], csv_files, csv_dict)
        dati = []
        if args.dataset:
            if "all" in args.dataset:
                dati = list(datasets_ambiente.values())
            else:
                check_year_range(args.dataset[0])
                for d in parse_range(args.dataset[0]):
                    # An unknown year raises KeyError, handled below.
                    dataset_id = datasets_ambiente[str(d)]
                    if dataset_id not in dati:
                        dati.append(dataset_id)
        dataframes = get_dataframes(dati_csv, dati, args.sensori)
        datamerged = merge_df(dataframes, list(dataframes.keys()))
        datamerged.to_csv("export.csv")
        import stazioni
        s = stazioni.get_stazioni()
        # The first column is skipped; the remaining ones are sensor columns
        # named "<id>" or "<id>-csv".
        for sensore in datamerged.columns[1:]:
            location = s.loc[s['idsensore'] == sensore.split("-")[0], 'nomestazione'].iloc[0]
            print('Valore medio per il sensore %s %s: %s' % (sensore, location, datamerged[sensore].mean().round(1)))
        plot_dataframe(datamerged)
    except KeyError:
        print("\nKeyError: forse hai specificato un dataset che non esiste ?\n"
              "i dataset sono disponibili per gli anni %s\n " % list(datasets_ambiente.keys()))
        traceback.print_exc()
        sys.exit(1)
    except KeyboardInterrupt:
        print("program terminated by user")
    except SystemExit:
        print("program terminated, bye")
        # Re-raise so the exit status chosen by the failing step is kept
        # instead of silently turning into a successful exit.
        raise
    except Exception:
        print("\nAn unhandled exception occured, here's the traceback!\n")
        traceback.print_exc()
        print("\nReport this to putro@autistici.org")
        # A bare sys.exit() would report success to the calling shell.
        sys.exit(1)

if __name__ == '__main__':
    main()
added csv download 2020-04-16 13:17:16 +02:00			`#!/usr/bin/python3`
initial repository creation 2020-04-05 19:21:02 +02:00
			`import argparse`
			`import traceback`
			`import sys`
			`import pandas as pd`
			`import numpy as np`
			`from sodapy import Socrata`
			`import matplotlib.pyplot as plt`
added csv download 2020-04-16 13:17:16 +02:00			`import re`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`import glob`
			`import os`
			`from os import getcwd, chdir`
initial repository creation 2020-04-05 19:21:02 +02:00
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`path_to_csv_files = "csv/"`
initial repository creation 2020-04-05 19:21:02 +02:00
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`datasets_ambiente = {"2020": "nicp-bhqi",`
			`"2019": "kujm-kavy",`
			`"2018": "bgqm-yq56",`
initial repository creation 2020-04-05 19:21:02 +02:00			`"2017": "j8j8-qsb2"}`

added csv download 2020-04-16 13:17:16 +02:00			`csv_ambiente = {"sensori_aria_1968-1995.zip": "puwt-3xxh",`
			`"sensori_aria_1996-2000.zip": "wabv-jucw",`
			`"sensori_aria_2001-2004.zip": "5jdj-7x8y",`
			`"sensori_aria_2005-2007.zip": "h3i4-wm93",`
			`"sensori_aria_2008-2010.zip": "wp2f-5nw6",`
			`"sensori_aria_2011.zip": "5mut-i45n",`
			`"sensori_aria_2012.zip": "wr4y-c9ti",`
			`"sensori_aria_2013.zip": "hsdm-3yhd",`
			`"sensori_aria_2014.zip": "69yc-isbh",`
			`"sensori_aria_2015.zip": "bpin-c7k8",`
			`"sensori_aria_2016.zip": "7v3n-37f3",`
			`"sensori_aria_2017.zip": "fdv6-2rbs",`
			`"sensori_aria_2018.zip": "4t9j-fd8z",`
			`"sensori_aria_2019.zip": "j2mz-aium"}`

initial repository creation 2020-04-05 19:21:02 +02:00
			`def _connect():`
			`client = Socrata("www.dati.lombardia.it", None)`
			`return client`

updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00
			`def read_data_online(dataset, sensore):`
initial repository creation 2020-04-05 19:21:02 +02:00			`client = _connect()`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`return client.get(dataset, IdSensore=sensore)`


			`def read_data_from_csv(datafile):`
			`return pd.read_csv("csv/" + datafile, usecols=['IdSensore', 'Data', 'Valore', 'Stato', 'idOperatore'])`


			`def process(dati, sensore, csv):`
			`""" processa i dati per un sensore da un dataset o un file csv e restituisce un dataframe """`
			`print('Sto processando i dati del sensore %s per l\'origine dati %s...' % (sensore, dati))`
			`if csv:`
			`results = read_data_from_csv(dati)`
			`else:`
			`results = read_data_online(dati, sensore)`
initial repository creation 2020-04-05 19:21:02 +02:00			`results_df = pd.DataFrame.from_records(results)`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`results_df.columns = [x.lower() for x in results_df.columns]`
initial repository creation 2020-04-05 19:21:02 +02:00			`try:`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`results_df = results_df.astype({'idsensore': 'int64'})`
			`results_df = results_df[results_df['idsensore'] == int(sensore)]`
initial repository creation 2020-04-05 19:21:02 +02:00			`results_df = results_df.astype({'valore': 'float64'})`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`results_df["data"] = pd.to_datetime(results_df["data"])`
			`results_df = results_df.replace(-9999, np.nan)`
initial repository creation 2020-04-05 19:21:02 +02:00			`except:`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`print('\nERRORE: dati non disponibili per il sensore %s\n') % sensore`
			`traceback.print_exc()`
initial repository creation 2020-04-05 19:21:02 +02:00			`sys.exit(-1)`
			`results_df.sort_values(by=['data'], inplace=True)`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`results_df.rename(columns={'valore': sensore}, inplace=True)`
			`results_df.drop(columns=['idoperatore', 'idsensore', 'stato'],`
			`inplace=True)`
initial repository creation 2020-04-05 19:21:02 +02:00			`return results_df`


			`def merge_df(dataframes, sensori):`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`""" fonde diversi dataframes in un dataframe unico con un sensore per colonna """`
initial repository creation 2020-04-05 19:21:02 +02:00			`df = dataframes[sensori[0]]`
			`for sensore in sensori[1:]:`
			`df = pd.merge(df, dataframes[sensore])`
better arguments handling, some fixes 2020-04-16 19:12:57 +02:00			`if len(df) == 0:`
			`print('\nERRORE: dati non disponibili per il sensore nel periodo considerato\n')`
			`sys.exit(-1)`
initial repository creation 2020-04-05 19:21:02 +02:00			`return df`


updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`def get_dataframes(dati_csv, dati, sensori):`
			`""" salva in un dict i dataframes dei vari sensori richiesti """`
initial repository creation 2020-04-05 19:21:02 +02:00			`dataframes = {}`
			`for sensore in sensori:`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`if dati_csv:`
			`df = process(dati_csv[0], sensore, True)`
			`for d in dati_csv[1:]:`
better arguments handling, some fixes 2020-04-16 19:12:57 +02:00			`df = pd.concat([df, process(d, sensore, True)], axis=0, ignore_index=True)`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`df.rename(columns={sensore: sensore + "-csv"}, inplace=True)`
			`dataframes[sensore + "-csv"] = df`
			`if dati:`
			`df = process(dati[0], sensore, False)`
			`for d in dati[1:]:`
better arguments handling, some fixes 2020-04-16 19:12:57 +02:00			`df = pd.concat([df, process(d, sensore, False)], axis=0, ignore_index=True)`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`dataframes[sensore] = df`
initial repository creation 2020-04-05 19:21:02 +02:00			`return dataframes`


			`def plot_dataframe(dataframe):`
			`dataframe.plot(x='data')`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`plt.axhline(y=50, color='black', linestyle='-', label='EU limit')`
initial repository creation 2020-04-05 19:21:02 +02:00			`plt.show()`


updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`def list_of_csv_files(dir_name):`
			`saved = getcwd()`
			`os.chdir(dir_name)`
read zipped csv files 2020-04-12 18:07:51 +02:00			`filelist = glob.glob('*.zip')`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`chdir(saved)`
			`return filelist`

initial repository creation 2020-04-05 19:21:02 +02:00
added csv download 2020-04-16 13:17:16 +02:00			`def parse_range(x):`
			`x = x.strip()`
			`if x.isdigit():`
			`yield str(x)`
			`elif '-' in x:`
			`xr = x.split('-')`
			`yield from range(int(xr[0].strip()), int(xr[1].strip()) + 1)`
			`else:`
			`raise ValueError(f"Unknown range specified: {x}")`


			`def get_csv_dict(dict):`
			`d = {}`
			`for (k, v) in dict.items():`
			`filename, id = k, v`
			`match_multi = re.search("\\d{4}-\\d{4}", filename)`
			`match_single = re.search("\\d{4}", filename)`
			`if match_multi:`
			`years = [str(x) for x in parse_range(str(match_multi.group()))]`
			`elif match_single:`
			`years = [match_single.group()]`
			`else:`
			`print("no match")`
			`for year in years:`
			`d.update({year: [filename, id]})`
			`return d`


			`def check_csv(args, filelist, csv_dict):`
			`years = [str(x) for x in parse_range(args)]`
			`f = []`
			`for y in years:`
			`if y not in csv_dict.keys():`
			`print("Errore: i dati per l'anno %s non sono disponibili come csv" % y)`
			`sys.exit(-1)`
			`if csv_dict[y][0] not in filelist:`
			`print("file %s for year %s is not available in folder %s" % (csv_dict[y][0], y, path_to_csv_files))`
			`download_csv(csv_dict[y][0], csv_dict[y][1], path_to_csv_files)`
better arguments handling, some fixes 2020-04-16 19:12:57 +02:00			`if csv_dict[y][0] not in f:`
			`f.append(csv_dict[y][0])`
added csv download 2020-04-16 13:17:16 +02:00			`return f`


			`def download_csv(filename, id, path):`
			`print("downloading %s....... please wait" % filename)`
			`import requests`
			`url = "https://www.dati.lombardia.it/download/" + id + "/application%2Fzip"`
			`req = requests.get(url, allow_redirects=True)`
			`try:`
			`req.raise_for_status()`
			`except (requests.ConnectionError,`
			`requests.RequestException,`
			`requests.HTTPError,`
			`requests.Timeout,`
			`requests.TooManyRedirects) as e:`
			`print("Download error: \n\t %s" % str(e))`
			`sys.exit(-1)`
			`else:`
			`f = open(os.path.dirname(path) + "/" + filename, "wb")`
			`f.write(req.content)`
			`f.close()`
			`pass`


better arguments handling, some fixes 2020-04-16 19:12:57 +02:00			`def check_year_range(arg):`
			`"""check if arg is a year or a year range"""`
			`if not re.search("\\d{4}-\\d{4}", arg):`
			`if not re.search("\\d{4}", arg):`
			`print("\nError: syntax for --csv and --dataset parameter: "`
			`"NNNN single year or NNNN-NNNN for years range\n")`
			`sys.exit(-1)`
			`return True`


updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`def main():`
initial repository creation 2020-04-05 19:21:02 +02:00			`parser = argparse.ArgumentParser()`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`parser.add_argument("--dataset", nargs='+', required=False,`
			`help="ricerca dei datasets")`
			`parser.add_argument("--csv", nargs='+', required=False,`
			`help="ricerca nei files csv")`
			`parser.add_argument('--sensori', nargs='+', required=True,`
			`help="cerca i dati di questi sensori")`
initial repository creation 2020-04-05 19:21:02 +02:00			`args = parser.parse_args()`

			`try:`
added csv download 2020-04-16 13:17:16 +02:00			`csv_dict = get_csv_dict(csv_ambiente)`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`csv_files = list_of_csv_files(path_to_csv_files)`
added csv download 2020-04-16 13:17:16 +02:00			`dati_csv = []`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`if args.csv:`
better arguments handling, some fixes 2020-04-16 19:12:57 +02:00			`check_year_range(args.csv[0])`
			`dati_csv = check_csv(args.csv[0], csv_files, csv_dict)`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`dati = []`
			`if args.dataset:`
			`if "all" in args.dataset:`
			`for k in datasets_ambiente.keys():`
			`dati.append(datasets_ambiente[k])`
			`else:`
better arguments handling, some fixes 2020-04-16 19:12:57 +02:00			`check_year_range(args.dataset[0])`
			`for d in parse_range(args.dataset[0]): # args.dataset:`
			`if datasets_ambiente[str(d)] not in dati:`
			`dati.append(datasets_ambiente[str(d)])`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`dataframes = get_dataframes(dati_csv, dati, args.sensori)`
added csv download 2020-04-16 13:17:16 +02:00			`datamerged = merge_df(dataframes, list(dataframes.keys()))`
			`datamerged.to_csv("export.csv")`
initial repository creation 2020-04-05 19:21:02 +02:00			`import stazioni`
			`s = stazioni.get_stazioni()`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`for sensore in datamerged.columns[1:]:`
			`location = s.loc[s['idsensore'] == sensore.split("-")[0], 'nomestazione'].iloc[0]`
initial repository creation 2020-04-05 19:21:02 +02:00			`print('Valore medio per il sensore %s %s: %s' % (sensore, location, datamerged[sensore].mean().round(1)))`
			`plot_dataframe(datamerged)`
			`except KeyError:`
better arguments handling, some fixes 2020-04-16 19:12:57 +02:00			`print("\nKeyError: forse hai specificato un dataset che non esiste ?\n"`
			`"i dataset sono disponibili per gli anni %s\n " % list(datasets_ambiente.keys()))`
updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00			`traceback.print_exc()`
initial repository creation 2020-04-05 19:21:02 +02:00			`except KeyboardInterrupt:`
			`print("program terminated by user")`
			`except SystemExit:`
			`print("program terminated, bye")`
			`except:`
			`print("\nAn unhandled exception occured, here's the traceback!\n")`
			`traceback.print_exc()`
			`print("\nReport this to putro@autistici.org")`
			`sys.exit()`

updated openlamb to read data online or from csv files 2020-04-12 14:08:38 +02:00
initial repository creation 2020-04-05 19:21:02 +02:00			`if __name__ == '__main__':`
			`main()`