# rick-and-morty-downloader-py/scraper.py
# (pasted from a web code viewer; original metadata: 109 lines, 3.2 KiB,
#  Python, snapshot 2017-10-21 03:51:37 +02:00)
import requests, json, ConfigParser, os
from bs4 import BeautifulSoup
from pprint import pprint
class Scraper(object):
    """Scrape an episode page, pick the highest-resolution video source
    and download it to disk.

    Configuration comes from ``rickandmorty.conf``: section ``Main``,
    key ``url`` — a template with two ``%d`` placeholders that are
    filled with (season, episode).

    NOTE(review): the station-parsing helpers below look unrelated to
    the video scraper (copied from a train-timetable project?) — kept
    for backward compatibility.
    """

    def __init__(self):
        """Read the URL template from the config file and immediately
        start downloading episode s01e01.

        Raises whatever ConfigParser raises if the file/section/key is
        missing.
        """
        config = ConfigParser.RawConfigParser()
        config.read('rickandmorty.conf')
        self.url = config.get('Main', 'url')
        # Side effect: __init__ triggers the full network download.
        self.load_page()

    def calc_res(self, resolution):
        """Return the pixel count of a ``"WIDTHxHEIGHT"`` string.

        Returns 0 for ``None``, for strings without an ``x`` separator,
        and for non-numeric components, so malformed ``res`` attributes
        never abort the best-resolution search.
        """
        if resolution is None:
            return 0
        vals = resolution.split('x')
        if len(vals) < 2:
            return 0
        try:
            return int(vals[0]) * int(vals[1])
        except ValueError:
            # e.g. "1080p" or other non-"WxH" markup — treat as unknown.
            return 0

    def load_page(self):
        """Fetch the episode page, resolve the player iframe, choose the
        highest-resolution <source> and stream it to ``s01e01.mp4``.

        Raises ValueError if the player frame contains no <source> tags
        (previously this crashed with an opaque IndexError).
        """
        # Hard-coded season 1, episode 1 substituted into the template.
        url = self.url % (1, 1)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')

        # The episode page embeds the real player in an element with
        # id="player"; its data-src attribute is the iframe URL.
        player = soup.find(id="player")
        frame_url = player.get('data-src').strip()

        r = requests.get(frame_url)
        soup = BeautifulSoup(r.text, 'html.parser')
        sources = soup.find_all(name="source")
        # (src, res) pairs; res may be None or malformed — calc_res
        # maps those to 0 so they lose the max() below.
        video_urls = [(s.get('src'), s.get('res')) for s in sources]
        if not video_urls:
            raise ValueError('no <source> tags found in player frame: %s'
                             % frame_url)

        # Highest pixel count wins; ties keep the first source, which
        # matches the original first-element fallback.
        best_url = max(video_urls, key=lambda v: self.calc_res(v[1]))[0]

        local_filename = "s01e01.mp4"
        # stream=True so the body is fetched chunk-by-chunk instead of
        # being buffered whole in memory.
        r = requests.get(best_url, stream=True)
        try:
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        finally:
            # Release the streaming connection back to the pool.
            r.close()

    def get_stations(self, filename):
        """Read a station dump file and return the parsed records.

        Added because ``__main__`` calls this method but it was never
        defined (AttributeError at runtime).
        """
        with open(filename) as f:
            return self.parse_stations(f.read())

    def parse_stations(self, stations):
        """Parse a newline-separated station dump into a list of dicts.

        Blank lines are skipped; each remaining line is handed to
        parse_station().
        """
        _ret = []
        for station in stations.split('\n'):
            if len(station) > 0:
                _ret.append(self.parse_station(station))
        return _ret

    def parse_station(self, station):
        """Parse one ``NAME|CODE`` line into ``{'name': ..., 'code': ...}``.

        Raises IndexError if the line has no ``|`` separator — callers
        are expected to pre-filter blank lines via parse_stations().
        """
        stat = station.split('|')
        return {
            'name': stat[0].strip(),
            'code': stat[1].strip()
        }

    def find_stations(self, station_name, stations):
        """Return every station dict whose name contains *station_name*
        (case-insensitive substring match)."""
        _ret = []
        for station in stations:
            if station_name.lower() in station['name'].lower():
                _ret.append(station)
        return _ret
if __name__ == '__main__':
    # Constructing Scraper also triggers the episode download as a
    # side effect of __init__.
    scraper = Scraper()
    # The original called scraper.get_stations(...), a method that did
    # not exist (AttributeError). Read the dump and parse it directly
    # with the existing parse_stations() instead.
    with open('elenco_stazioni.txt') as f:
        stations = scraper.parse_stations(f.read())
    pprint(stations)