#!/usr/bin/env python
import requests
import ConfigParser
import time
from bs4 import BeautifulSoup

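# rickandmorty.conf is read by Scraper.__init__ below; the file itself is not part of
# this script. Judging from config.get('Main', 'url') and the `self.url % (season,
# episode)` call in get_episode_url, it presumably looks something like this (the URL
# is only an illustrative placeholder):
#
#   [Main]
#   url = http://example.com/rick-and-morty/season-%s/episode-%s
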
class Scraper:
    name = 'Rick and Morty py'
    author = 'NotIsSet'
    license = 'GPLv2'
    year = '2017'
    version = '0.0.1'

    # Console output state: buffer_out() accumulates lines here and redraws the whole
    # screen, at most once per out_buffer_refresh_threshold milliseconds.
    out_buffer = ''
    out_buffer_time = 0
    out_buffer_refresh_threshold = 0

    @staticmethod
    def bytes_to_multiples(b):
        # Render a byte count as a human-readable decimal multiple ('b', 'kb', 'mb', 'gb').
        divisor = 1
        unit = 'b'

        if b >= 1000000000:
            unit = 'gb'
            divisor = 1000000000
        elif b >= 1000000:
            unit = 'mb'
            divisor = 1000000
        elif b >= 1000:
            unit = 'kb'
            divisor = 1000

        _ret = round(float(b) / divisor, 2)
        # Drop the trailing '.0' when the value is a whole number.
        if _ret - int(_ret) == 0.0:
            _ret = int(_ret)
        _ret = str(_ret) + unit
        return _ret

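    # Illustrative values (not from the original script):
    #   Scraper.bytes_to_multiples(500)     -> '500b'
    #   Scraper.bytes_to_multiples(2048)    -> '2.05kb'
    #   Scraper.bytes_to_multiples(1500000) -> '1.5mb'
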
    @staticmethod
    def get_speed_string(bps):
        # Format a bytes-per-second figure, e.g. '1.5mb/s'.
        return Scraper.bytes_to_multiples(bps) + '/s'

    @staticmethod
    def get_completion(written, total):
        # Report progress as 'NN% - written/total', e.g. '42% - 84mb/200mb'.
        out_percentage = round(float(written * 100) / total, 2)
        if out_percentage - int(out_percentage) == 0.0:
            out_percentage = int(out_percentage)

        return '%s%% - %s/%s' % (
            out_percentage,
            Scraper.bytes_to_multiples(written),
            Scraper.bytes_to_multiples(total)
        )

    @staticmethod
    def calc_res(resolution):
        # Turn a 'WIDTHxHEIGHT' string into a pixel count so resolutions can be compared.
        if resolution is None:
            return 0

        factors = resolution.split('x')
        if len(factors) < 2:
            return 0

        return int(factors[0]) * int(factors[1])

    @staticmethod
    def get_top_resolution(resolutions):
        # Pick the source URL with the largest resolution from a list of
        # (source_url, resolution_string) pairs.
        if not resolutions:
            return None

        top_res = 0
        _ret = resolutions[0][0]
        for video in resolutions:
            if Scraper.calc_res(video[1]) > top_res:
                top_res = Scraper.calc_res(video[1])
                _ret = video[0]
        return _ret

    def buffer_out(self, string, concatenate=True):
        # Append `string` to the output buffer (unless concatenate=False) and redraw the
        # whole buffer, clearing the terminal first, at most once per
        # out_buffer_refresh_threshold milliseconds.
        current_time = int(round(time.time() * 1000))
        out = self.out_buffer + '\n' + string
        if concatenate:
            self.out_buffer = out

        if current_time - self.out_buffer_time > self.out_buffer_refresh_threshold:
            self.out_buffer_time = current_time
            print '\x1b[2J' + out  # '\x1b[2J' is the ANSI clear-screen sequence

    def __init__(self):
        self.info_header = '\n'.join([
            'name: ' + self.name,
            'author: ' + self.author,
            'license: ' + self.license,
            'year: ' + self.year,
            'version: ' + self.version,
        ])

        # The base URL (with season/episode placeholders) comes from rickandmorty.conf.
        config = ConfigParser.RawConfigParser()
        config.read('rickandmorty.conf')
        self.url = config.get('Main', 'url')

        # Present a browser-like User-Agent on every request.
        self.headers = requests.utils.default_headers()
        self.headers.update({"User-Agent": "Mozilla/5.0"})

    def get_episode_url(self, season, episode):
        # Resolve the direct video URL for one episode: load the episode page, follow the
        # embedded player frame, then pick the highest-resolution <source> entry.
        url = self.url % (season, episode)
        r = requests.get(url, headers=self.headers)
        # self.buffer_out(url)
        soup = BeautifulSoup(r.text, 'html.parser')

        player = soup.find(id="player")
        if player is None:
            return None

        frame_url = player.get('data-src')
        if frame_url is None:
            return None

        frame_url = frame_url.strip()
        # self.buffer_out(frame_url)
        r = requests.get(frame_url, headers=self.headers)
        soup = BeautifulSoup(r.text, 'html.parser')

        video_resolutions = [x.get('res') for x in soup.find_all(name="source")]
        video_sources = [x.get('src') for x in soup.find_all(name="source")]
        video_urls = zip(video_sources, video_resolutions)

        return Scraper.get_top_resolution(video_urls)

    def download_file(self, url, destination):
        # Stream the video to disk in 1KiB chunks, printing progress and a rolling
        # average download speed along the way.
        r = requests.get(url, stream=True, headers=self.headers)
        written_chunks = 0
        last_time = time.time()

        with open(destination, 'wb') as f:
            # Assumes the server reports a Content-Length header.
            total_length = int(r.headers['Content-Length'])
            speed_buffer = []
            speed_buffer_size = 1000

            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    time_diff = time.time() - last_time
                    last_time = time.time()

                    # Rolling window of per-chunk transfer rates (bytes per second).
                    speed_buffer.append(1024 / time_diff)
                    if len(speed_buffer) > speed_buffer_size:
                        speed_buffer.pop(0)

                    f.write(chunk)
                    written_chunks += 1

                    completion = Scraper.get_completion(written_chunks * 1024, total_length)
                    speed = Scraper.get_speed_string(float(sum(speed_buffer)) / len(speed_buffer))

                    # Only redraw every 20 chunks to keep the terminal output manageable.
                    if written_chunks % 20 == 0:
                        self.buffer_out(
                            'Download in progress...\n%s - %s\n' % (
                                completion, speed),
                            False)

        self.buffer_out('Download complete')

    def run(self, download=False):
        # Walk seasons 1-3, episodes 1-11, resolve each episode's best video URL and
        # optionally download it as sXeY.mp4.
        self.buffer_out(self.info_header)
        self.buffer_out(self.url)

        for season in range(1, 4):
            for episode in range(1, 12):
                ep_url = self.get_episode_url(season, episode)
                if ep_url is None:
                    self.buffer_out('season %s episode %s - WARN could not retrieve url' % (season, episode))
                    continue

                self.buffer_out('season %s episode %s - url: %s' % (season, episode, ep_url))
                if download:
                    destination = "s%se%s.mp4" % (season, episode)
                    self.download_file(ep_url, destination)


if __name__ == '__main__':
    scraper = Scraper()
    scraper.run()
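
# Note: run() defaults to download=False, so the entry point above only resolves and
# lists episode URLs; call scraper.run(download=True) to also download the files.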