rick-and-morty-downloader-py/scraper.py

#!/usr/bin/env python

import requests
import ConfigParser
import time
from bs4 import BeautifulSoup


class Scraper:
    """Locate (and optionally download) episode video files.

    The episode page URL template is read from the ``url`` option of the
    ``[Main]`` section of ``rickandmorty.conf`` and is %-formatted with
    ``(season, episode)``.  Progress and results are shown through
    :meth:`buffer_out`, which redraws the whole terminal on every update.
    """

    name = 'Rick and Morty py'
    author = 'NotIsSet'
    license = 'GPLv2'
    year = '2017'
    version = '0.0.1'

    # Screen state used by buffer_out().
    out_buffer = ''  # text accumulated so far (persistent part of the screen)
    out_buffer_time = 0  # time of the last redraw, in ms since the epoch
    out_buffer_refresh_threshold = 0  # minimum number of ms between redraws

    @staticmethod
    def bytes_to_multiples(b):
        """Format a byte count using decimal (power of 1000) units.

        The value is rounded to two decimals and printed without a
        fractional part when it is a whole number.

        :param b: number of bytes (int or float)
        :return: string such as ``'999b'``, ``'1.5kb'``, ``'2mb'``, ``'3gb'``
        """
        divisor, unit = 1, 'b'
        # Largest unit first so the first match is the right one.
        for threshold, suffix in ((1000000000, 'gb'),
                                  (1000000, 'mb'),
                                  (1000, 'kb')):
            if b >= threshold:
                divisor, unit = threshold, suffix
                break

        value = round(float(b) / divisor, 2)
        if value == int(value):
            value = int(value)
        return str(value) + unit

    @staticmethod
    def get_speed_string(bps):
        """Format a transfer rate given in bytes per second, e.g. ``'1.5kb/s'``."""
        return Scraper.bytes_to_multiples(bps) + '/s'

    @staticmethod
    def get_completion(written, total):
        """Describe download progress as ``'<percent>% - <written>/<total>'``.

        :param written: number of bytes written so far
        :param total: expected total number of bytes; ``None`` or a value
            ``<= 0`` means the total is unknown
        :return: progress string; percentage and total are shown as ``?``
            when the total is unknown (instead of dividing by zero)
        """
        written_str = Scraper.bytes_to_multiples(written)
        if total is None or total <= 0:
            return '?%% - %s/?' % written_str

        out_percentage = round(float(written * 100) / total, 2)
        if out_percentage == int(out_percentage):
            out_percentage = int(out_percentage)

        return '%s%% - %s/%s' % (
            out_percentage,
            written_str,
            Scraper.bytes_to_multiples(total)
        )

    @staticmethod
    def calc_res(resolution):
        """Return the pixel count of a ``'<width>x<height>'`` string.

        :param resolution: resolution string or ``None``
        :return: ``width * height``, or 0 when the value is ``None``, has no
            ``x`` separator, or its parts are not integers
        """
        if resolution is None:
            return 0
        factors = resolution.lower().split('x')
        if len(factors) < 2:
            return 0
        try:
            return int(factors[0]) * int(factors[1])
        except ValueError:
            # Malformed attribute: rank it lowest rather than abort the scrape.
            return 0

    @staticmethod
    def get_top_resolution(resolutions):
        """Pick the URL with the largest resolution.

        :param resolutions: iterable of ``(url, resolution)`` pairs
        :return: the URL of the first pair with the highest pixel count; the
            first URL when no pair has a usable resolution; ``None`` when the
            iterable is empty
        """
        # Materialise first: the argument may be a one-shot iterator
        # (e.g. the result of zip() on Python 3).
        candidates = list(resolutions)
        if not candidates:
            return None

        top_res = 0
        best_url = candidates[0][0]
        for video in candidates:
            res = Scraper.calc_res(video[1])
            if res > top_res:
                top_res = res
                best_url = video[0]
        return best_url

    def buffer_out(self, string, concatenate=True):
        """Redraw the terminal with the buffered text followed by ``string``.

        :param string: text to show below the buffered text
        :param concatenate: when true, ``string`` is appended to the buffer
            and stays visible on later redraws; when false it is shown for
            this redraw only (used for transient progress lines)

        The redraw is skipped when less than ``out_buffer_refresh_threshold``
        milliseconds have passed since the previous one.
        """
        current_time = int(round(time.time() * 1000))
        out = self.out_buffer + '\n' + string
        if concatenate:
            self.out_buffer = out

        if current_time - self.out_buffer_time > self.out_buffer_refresh_threshold:
            self.out_buffer_time = current_time
            # '\x1b[2J' is the ANSI "erase display" sequence.  The
            # single-argument call form prints the same on Python 2 and 3.
            print('\x1b[2J' + out)

    def __init__(self):
        """Build the info header, load the URL template and HTTP headers.

        ``RawConfigParser.read`` silently skips a missing file, so a missing
        ``rickandmorty.conf`` surfaces as an error from ``config.get``.
        """
        self.info_header = '\n'.join([
            'name:  ' + self.name,
            'author:  ' + self.author,
            'license:  ' + self.license,
            'year:  ' + self.year,
            'version:  ' + self.version,
        ])

        config = ConfigParser.RawConfigParser()
        config.read('rickandmorty.conf')
        self.url = config.get('Main', 'url')

        self.headers = requests.utils.default_headers()
        self.headers.update({"User-Agent": "Mozilla/5.0"})

    def get_episode_url(self, season, episode):
        """Resolve the best-resolution video URL of one episode.

        Fetches the episode page, follows the ``data-src`` attribute of the
        element with id ``player`` and inspects the ``<source>`` tags of that
        document.

        :param season: season number substituted into the URL template
        :param episode: episode number substituted into the URL template
        :return: video URL, or ``None`` when the player element, its
            ``data-src`` attribute or any ``<source>`` with a ``src`` is
            missing
        """
        url = self.url % (season, episode)
        r = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(r.text, 'html.parser')
        player = soup.find(id="player")
        if player is None:
            return None

        frame_url = player.get('data-src')
        if frame_url is None:
            return None
        frame_url = frame_url.strip()
        if not frame_url:
            return None

        r = requests.get(frame_url, headers=self.headers)
        soup = BeautifulSoup(r.text, 'html.parser')

        # One pass over the <source> tags; entries without a src are useless.
        video_urls = [
            (source.get('src'), source.get('res'))
            for source in soup.find_all(name="source")
            if source.get('src')
        ]

        return Scraper.get_top_resolution(video_urls)

    def download_file(self, url, destination):
        """Stream ``url`` into the file ``destination`` and show progress.

        Progress is reported every 20 chunks.  The speed shown is the average
        over the most recent progress reports.  A response with an HTTP error
        status is reported as a warning and nothing is written.

        :param url: URL of the file to download
        :param destination: path of the file to create or overwrite
        """
        r = requests.get(url, stream=True, headers=self.headers)
        try:
            if not r.ok:
                self.buffer_out(
                    'WARN download failed with HTTP status %s' % r.status_code)
                return

            # Content-Length is optional (e.g. chunked transfer); 0 = unknown.
            try:
                total_length = int(r.headers.get('Content-Length', 0))
            except ValueError:
                total_length = 0

            written_bytes = 0
            written_chunks = 0
            # (timestamp, bytes written) at each progress report; the oldest
            # retained sample is the start of the speed-averaging window.
            samples = [(time.time(), 0)]
            max_samples = 50

            with open(destination, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if not chunk:  # filter out keep-alive new chunks
                        continue

                    f.write(chunk)
                    written_chunks += 1
                    written_bytes += len(chunk)

                    if written_chunks % 20 != 0:
                        continue

                    now = time.time()
                    samples.append((now, written_bytes))
                    if len(samples) > max_samples:
                        samples.pop(0)

                    window_start, window_bytes = samples[0]
                    elapsed = now - window_start
                    # Coarse clocks can report no elapsed time at all.
                    if elapsed > 0:
                        bps = (written_bytes - window_bytes) / elapsed
                    else:
                        bps = 0

                    self.buffer_out(
                        'Download in progress...\n%s - %s\n' % (
                            Scraper.get_completion(written_bytes, total_length),
                            Scraper.get_speed_string(bps)),
                        False)
            self.buffer_out('Download complete')
        finally:
            # Release the streamed connection even on errors or early return.
            r.close()

    def run(self, download=False, seasons=3, episodes=11):
        """List (and optionally download) every episode.

        :param download: when true, each resolved episode is saved as
            ``s<season>e<episode>.mp4`` in the current directory
        :param seasons: number of seasons to scan, starting at season 1
        :param episodes: number of episodes to scan per season, starting at 1
        """
        self.buffer_out(self.info_header)
        self.buffer_out(self.url)
        for season in range(1, seasons + 1):
            for episode in range(1, episodes + 1):
                ep_url = self.get_episode_url(season, episode)
                if ep_url is None:
                    self.buffer_out('season %s episode %s - WARN could not retrieve url' % (season, episode))
                    continue
                self.buffer_out('season %s episode %s - url: %s' % (season, episode, ep_url))
                if download:
                    destination = "s%se%s.mp4" % (season, episode)
                    self.download_file(ep_url, destination)


if __name__ == '__main__':
    # Script entry point: run() is called with its defaults, so episode URLs
    # are only listed, not downloaded.
    Scraper().run()
download status, exception handling 2017-11-06 00:50:44 +01:00			`#!/usr/bin/env python`
Initial commit 2017-10-21 03:51:37 +02:00
download status, exception handling 2017-11-06 00:50:44 +01:00			`import requests`
			`import ConfigParser`
			`import time`
			`from bs4 import BeautifulSoup`
Initial commit 2017-10-21 03:51:37 +02:00

download status, exception handling 2017-11-06 00:50:44 +01:00			`class Scraper:`
			`name = 'Rick and Morty py'`
			`author = 'NotIsSet'`
			`license = 'GPLv2'`
			`year = '2017'`
			`version = '0.0.1'`

			`out_buffer = ''`
			`out_buffer_time = 0`
			`out_buffer_refresh_threshold = 0`

			`@staticmethod`
			`def bytes_to_multiples(b):`
			`divisor = 1`
			`unit = 'b'`

			`if b >= 1000000000:`
			`unit = 'gb'`
			`divisor = 1000000000`
			`elif b >= 1000000:`
			`unit = 'mb'`
			`divisor = 1000000`
			`elif b >= 1000:`
			`unit = 'kb'`
			`divisor = 1000`

			`_ret = round((float(b) / divisor), 2)`
			`if _ret - int(_ret) == 0.0:`
			`_ret = int(_ret)`
			`_ret = str(_ret) + unit`
			`return _ret`
Initial commit 2017-10-21 03:51:37 +02:00
download status, exception handling 2017-11-06 00:50:44 +01:00			`@staticmethod`
			`def get_speed_string(bps):`
			`return Scraper.bytes_to_multiples(bps) + '/s'`
Initial commit 2017-10-21 03:51:37 +02:00
download status, exception handling 2017-11-06 00:50:44 +01:00			`@staticmethod`
			`def get_completion(written, total):`
			`out_percentage = round(((float(written * 100)) / total), 2)`
			`if out_percentage - int(out_percentage) == 0.0:`
			`out_percentage = int(out_percentage)`
Initial commit 2017-10-21 03:51:37 +02:00
download status, exception handling 2017-11-06 00:50:44 +01:00			`return '%s%% - %s/%s' % (`
			`out_percentage,`
			`Scraper.bytes_to_multiples(written),`
			`Scraper.bytes_to_multiples(total)`
			`)`
Initial commit 2017-10-21 03:51:37 +02:00
download status, exception handling 2017-11-06 00:50:44 +01:00			`@staticmethod`
			`def calc_res(resolution):`
Initial commit 2017-10-21 03:51:37 +02:00			`if resolution is None:`
			`return 0`
download status, exception handling 2017-11-06 00:50:44 +01:00			`factors = resolution.split('x')`
			`if len(factors) < 2:`
Initial commit 2017-10-21 03:51:37 +02:00			`return 0`
download status, exception handling 2017-11-06 00:50:44 +01:00			`_ret = int(factors[0]) * int(factors[1])`
Initial commit 2017-10-21 03:51:37 +02:00			`return _ret`

download status, exception handling 2017-11-06 00:50:44 +01:00			`@staticmethod`
			`def get_top_resolution(resolutions):`
			`top_res = 0`
			`_ret = resolutions[0][0]`
			`for video in resolutions:`
			`if Scraper.calc_res(video[1]) > top_res:`
			`top_res = Scraper.calc_res(video[1])`
			`_ret = video[0]`
			`return _ret`

			`def buffer_out(self, string, concatenate=True):`
			`current_time = int(round(time.time() * 1000))`
			`out = self.out_buffer + '\n' + string`
			`if concatenate:`
			`self.out_buffer = out`

			`if current_time - self.out_buffer_time > self.out_buffer_refresh_threshold:`
			`self.out_buffer_time = current_time`
			`print '\x1b[2J' + out`

			`def __init__(self):`
			`self.info_header = '\n'.join([`
			`'name: ' + self.name,`
			`'author: ' + self.author,`
			`'license: ' + self.license,`
			`'year: ' + self.year,`
			`'version: ' + self.version,`
			`])`

			`config = ConfigParser.RawConfigParser()`
			`config.read('rickandmorty.conf')`
			`self.url = config.get('Main', 'url')`

			`self.headers = requests.utils.default_headers()`
			`self.headers.update({"User-Agent": "Mozilla/5.0"})`

			`def get_episode_url(self, season, episode):`
			`url = self.url % (season, episode)`
			`r = requests.get(url, headers=self.headers)`
			`# self.buffer_out(url)`
Initial commit 2017-10-21 03:51:37 +02:00			`soup = BeautifulSoup(r.text, 'html.parser')`
			`player = soup.find(id="player")`
download status, exception handling 2017-11-06 00:50:44 +01:00			`if player is None:`
			`return None`
			`frame_url = player.get('data-src')`
			`if player is None:`
			`return None`
			`frame_url = frame_url.strip()`
			`# self.buffer_out(frame_url)`
			`r = requests.get(frame_url, headers=self.headers)`
Initial commit 2017-10-21 03:51:37 +02:00			`soup = BeautifulSoup(r.text, 'html.parser')`

download status, exception handling 2017-11-06 00:50:44 +01:00			`video_resolutions = [x.get('res') for x in soup.find_all(name="source")]`
			`video_sources = [x.get('src') for x in soup.find_all(name="source")]`
Initial commit 2017-10-21 03:51:37 +02:00
download status, exception handling 2017-11-06 00:50:44 +01:00			`video_urls = zip(video_sources, video_resolutions)`
Initial commit 2017-10-21 03:51:37 +02:00
download status, exception handling 2017-11-06 00:50:44 +01:00			`return Scraper.get_top_resolution(video_urls)`
Initial commit 2017-10-21 03:51:37 +02:00
download status, exception handling 2017-11-06 00:50:44 +01:00			`def download_file(self, url, destination):`
			`r = requests.get(url, stream=True, headers=self.headers)`
			`written_chunks = 0`
Initial commit 2017-10-21 03:51:37 +02:00
download status, exception handling 2017-11-06 00:50:44 +01:00			`last_time = time.time()`
Initial commit 2017-10-21 03:51:37 +02:00
download status, exception handling 2017-11-06 00:50:44 +01:00			`with open(destination, 'wb') as f:`
			`total_length = int(r.headers['Content-Length'])`
			`speed_buffer = []`
			`speed_buffer_size = 1000`

			`for chunk in r.iter_content(chunk_size=1024):`
			`if chunk: # filter out keep-alive new chunks`
			`time_diff = time.time() - last_time`
			`last_time = time.time()`

			`speed_buffer.append(float(1024 / time_diff / 10))`
			`if len(speed_buffer) > speed_buffer_size:`
			`speed_buffer.pop(0)`

			`f.write(chunk)`
			`written_chunks += 1`

			`completion = Scraper.get_completion(written_chunks * 1000, total_length / 1024 * 1000)`
			`speed = Scraper.get_speed_string(float(sum(speed_buffer)) / speed_buffer_size)`

			`if written_chunks % 20 == 0:`
			`self.buffer_out(`
			`'Download in progress...\n%s - %s\n' % (`
			`completion, speed),`
			`False)`
			`self.buffer_out('Download complete')`

			`def run(self, download=False):`
			`self.buffer_out(self.info_header)`
			`self.buffer_out(self.url)`
			`for season in range(1, 4):`
			`for episode in range(1, 12):`
			`ep_url = self.get_episode_url(season, episode)`
			`if ep_url is None:`
			`self.buffer_out('season %s episode %s - WARN could not retrieve url' % (season, episode))`
			`continue`
			`self.buffer_out('season %s episode %s - url: %s' % (season, episode, ep_url))`
			`if download:`
			`destination = "s%se%s.mp4" % (season, episode)`
			`self.download_file(ep_url, destination)`
			`pass`
Initial commit 2017-10-21 03:51:37 +02:00

			`if __name__ == '__main__':`
			`scraper = Scraper()`
download status, exception handling 2017-11-06 00:50:44 +01:00			`scraper.run()`