#!/usr/bin/env python3
"""Scrape Rick and Morty episode pages for video URLs and optionally
stream-download the highest-resolution file for each episode."""

import configparser
import time
from collections import deque

import requests
from bs4 import BeautifulSoup


class Scraper:
    """Fetches episode pages, extracts ``<source>`` video URLs from the
    embedded player frame, and can download the best-resolution stream.

    Console output goes through a rolling buffer that clears and redraws
    the screen at most once per ``out_buffer_refresh_threshold`` ms.
    """

    name = 'Rick and Morty py'
    author = 'NotIsSet'
    license = 'GPLv2'
    year = '2017'
    version = '0.0.1'

    # Rolling console-output state (see buffer_out).
    out_buffer = ''
    out_buffer_time = 0
    out_buffer_refresh_threshold = 0

    @staticmethod
    def bytes_to_multiples(b):
        """Return *b* bytes as a short SI string, e.g. 1500 -> '1.5kb'.

        Whole values drop the trailing '.0' ('2kb', never '2.0kb').
        """
        divisor, unit = 1, 'b'
        if b >= 1000000000:
            divisor, unit = 1000000000, 'gb'
        elif b >= 1000000:
            divisor, unit = 1000000, 'mb'
        elif b >= 1000:
            divisor, unit = 1000, 'kb'
        value = round(float(b) / divisor, 2)
        if value == int(value):
            value = int(value)
        return '%s%s' % (value, unit)

    @staticmethod
    def get_speed_string(bps):
        """Format a bytes-per-second rate, e.g. 2050000 -> '2.05mb/s'."""
        return Scraper.bytes_to_multiples(bps) + '/s'

    @staticmethod
    def get_completion(written, total):
        """Return a progress string '<pct>% - <written>/<total>'.

        *written* and *total* are byte counts; *total* must be non-zero.
        """
        pct = round(float(written * 100) / total, 2)
        if pct == int(pct):
            pct = int(pct)
        return '%s%% - %s/%s' % (
            pct,
            Scraper.bytes_to_multiples(written),
            Scraper.bytes_to_multiples(total),
        )

    @staticmethod
    def calc_res(resolution):
        """Return the pixel count of a 'WIDTHxHEIGHT' string.

        Returns 0 for None or a string without an 'x' separator.
        """
        if resolution is None:
            return 0
        factors = resolution.split('x')
        if len(factors) < 2:
            return 0
        return int(factors[0]) * int(factors[1])

    @staticmethod
    def get_top_resolution(resolutions):
        """Given (url, 'WxH') pairs, return the url with the most pixels.

        Returns None for an empty sequence (bug fix: the original indexed
        element 0 unconditionally and raised IndexError).
        """
        resolutions = list(resolutions)
        if not resolutions:
            return None
        top_res = 0
        best_url = resolutions[0][0]
        for url, res in resolutions:
            pixels = Scraper.calc_res(res)
            if pixels > top_res:
                top_res = pixels
                best_url = url
        return best_url

    def buffer_out(self, string, concatenate=True):
        """Print the rolling output buffer, rate-limited.

        *string* is appended to the persistent buffer when *concatenate*
        is true; either way the screen is cleared (ESC[2J) and redrawn at
        most once per out_buffer_refresh_threshold milliseconds.
        """
        now_ms = int(round(time.time() * 1000))
        out = self.out_buffer + '\n' + string
        if concatenate:
            self.out_buffer = out
        if now_ms - self.out_buffer_time > self.out_buffer_refresh_threshold:
            self.out_buffer_time = now_ms
            print('\x1b[2J' + out)

    def __init__(self):
        self.info_header = '\n'.join([
            'name: ' + self.name,
            'author: ' + self.author,
            'license: ' + self.license,
            'year: ' + self.year,
            'version: ' + self.version,
        ])
        # 'url' is a template with two %s placeholders (season, episode);
        # see get_episode_url.
        config = configparser.RawConfigParser()
        config.read('rickandmorty.conf')
        self.url = config.get('Main', 'url')
        self.headers = requests.utils.default_headers()
        self.headers.update({"User-Agent": "Mozilla/5.0"})

    def get_episode_url(self, season, episode):
        """Return the best-resolution video URL for an episode, or None.

        Fetches the episode page, follows the player frame's 'data-src'
        URL, and picks the highest-resolution <source> inside the frame.
        """
        url = self.url % (season, episode)
        r = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(r.text, 'html.parser')
        player = soup.find(id="player")
        if player is None:
            return None
        frame_url = player.get('data-src')
        # Bug fix: the original re-tested `player` here instead of the
        # attribute it had just read, so a missing data-src crashed below.
        if frame_url is None:
            return None
        frame_url = frame_url.strip()
        r = requests.get(frame_url, headers=self.headers)
        soup = BeautifulSoup(r.text, 'html.parser')
        sources = soup.find_all(name="source")
        video_urls = [(s.get('src'), s.get('res')) for s in sources]
        return Scraper.get_top_resolution(video_urls)

    def download_file(self, url, destination):
        """Stream *url* into *destination*, printing progress and speed.

        Progress is reported every 20 chunks; speed is a moving average
        over the last `speed_buffer_size` chunk timings.
        """
        chunk_size = 1024
        r = requests.get(url, stream=True, headers=self.headers)
        written = 0
        written_chunks = 0
        last_time = time.time()
        with open(destination, 'wb') as f:
            total_length = int(r.headers['Content-Length'])
            # deque(maxlen=...) replaces the original list + pop(0), which
            # was O(n) per chunk.
            speed_buffer = deque(maxlen=1000)
            for chunk in r.iter_content(chunk_size=chunk_size):
                if not chunk:  # filter out keep-alive chunks
                    continue
                now = time.time()
                # Guard against a zero interval on fast/buffered reads,
                # which made the original raise ZeroDivisionError.
                time_diff = max(now - last_time, 1e-9)
                last_time = now
                speed_buffer.append(len(chunk) / time_diff)
                f.write(chunk)
                written += len(chunk)
                written_chunks += 1
                if written_chunks % 20 == 0:
                    # Bug fixes: average over the samples we actually have
                    # (the original divided by the max buffer size, hugely
                    # under-reporting speed), and report real byte counts
                    # (the original mixed 1024-byte chunks with x1000
                    # conversions).
                    speed = Scraper.get_speed_string(
                        sum(speed_buffer) / len(speed_buffer))
                    completion = Scraper.get_completion(written, total_length)
                    self.buffer_out(
                        'Download in progress...\n%s - %s\n'
                        % (completion, speed), False)
        self.buffer_out('Download complete')

    def run(self, download=False):
        """Resolve (and optionally download) every episode of seasons 1-3.

        Episodes 1-11 of each season are attempted; a warning line is
        printed for any episode whose URL cannot be retrieved.
        """
        self.buffer_out(self.info_header)
        self.buffer_out(self.url)
        for season in range(1, 4):
            for episode in range(1, 12):
                ep_url = self.get_episode_url(season, episode)
                if ep_url is None:
                    self.buffer_out(
                        'season %s episode %s - WARN could not retrieve url'
                        % (season, episode))
                    continue
                self.buffer_out(
                    'season %s episode %s - url: %s'
                    % (season, episode, ep_url))
                if download:
                    destination = "s%se%s.mp4" % (season, episode)
                    self.download_file(ep_url, destination)


if __name__ == '__main__':
    scraper = Scraper()
    scraper.run()