# rick-and-morty-downloader-py/scraper.py
# (pasted from a web code viewer; original metadata: 109 lines, 3.2 KiB,
#  Python, snapshot 2017-10-21 03:51:37 +02:00)
import requests, json, ConfigParser, os
from bs4 import BeautifulSoup
from pprint import pprint
class Scraper(object):
    """Scrape an episode page, pick the highest-resolution video source
    and download it to disk.

    Configuration comes from ``rickandmorty.conf``: section ``Main``,
    key ``url`` — a template with two ``%d`` placeholders that are
    filled with (season, episode).

    NOTE(review): the station-parsing helpers below look unrelated to
    the video scraper (copied from a train-timetable project?) — kept
    for backward compatibility.
    """

    def __init__(self):
        """Read the URL template from the config file and immediately
        start downloading episode s01e01.

        Raises whatever ConfigParser raises if the file/section/key is
        missing.
        """
        config = ConfigParser.RawConfigParser()
        config.read('rickandmorty.conf')
        self.url = config.get('Main', 'url')
        # Side effect: __init__ triggers the full network download.
        self.load_page()

    def calc_res(self, resolution):
        """Return the pixel count of a ``"WIDTHxHEIGHT"`` string.

        Returns 0 for ``None``, for strings without an ``x`` separator,
        and for non-numeric components, so malformed ``res`` attributes
        never abort the best-resolution search.
        """
        if resolution is None:
            return 0
        vals = resolution.split('x')
        if len(vals) < 2:
            return 0
        try:
            return int(vals[0]) * int(vals[1])
        except ValueError:
            # e.g. "1080p" or other non-"WxH" markup — treat as unknown.
            return 0

    def load_page(self):
        """Fetch the episode page, resolve the player iframe, choose the
        highest-resolution <source> and stream it to ``s01e01.mp4``.

        Raises ValueError if the player frame contains no <source> tags
        (previously this crashed with an opaque IndexError).
        """
        # Hard-coded season 1, episode 1 substituted into the template.
        url = self.url % (1, 1)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')

        # The episode page embeds the real player in an element with
        # id="player"; its data-src attribute is the iframe URL.
        player = soup.find(id="player")
        frame_url = player.get('data-src').strip()

        r = requests.get(frame_url)
        soup = BeautifulSoup(r.text, 'html.parser')
        sources = soup.find_all(name="source")
        # (src, res) pairs; res may be None or malformed — calc_res
        # maps those to 0 so they lose the max() below.
        video_urls = [(s.get('src'), s.get('res')) for s in sources]
        if not video_urls:
            raise ValueError('no <source> tags found in player frame: %s'
                             % frame_url)

        # Highest pixel count wins; ties keep the first source, which
        # matches the original first-element fallback.
        best_url = max(video_urls, key=lambda v: self.calc_res(v[1]))[0]

        local_filename = "s01e01.mp4"
        # stream=True so the body is fetched chunk-by-chunk instead of
        # being buffered whole in memory.
        r = requests.get(best_url, stream=True)
        try:
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        finally:
            # Release the streaming connection back to the pool.
            r.close()

    def get_stations(self, filename):
        """Read a station dump file and return the parsed records.

        Added because ``__main__`` calls this method but it was never
        defined (AttributeError at runtime).
        """
        with open(filename) as f:
            return self.parse_stations(f.read())

    def parse_stations(self, stations):
        """Parse a newline-separated station dump into a list of dicts.

        Blank lines are skipped; each remaining line is handed to
        parse_station().
        """
        _ret = []
        for station in stations.split('\n'):
            if len(station) > 0:
                _ret.append(self.parse_station(station))
        return _ret

    def parse_station(self, station):
        """Parse one ``NAME|CODE`` line into ``{'name': ..., 'code': ...}``.

        Raises IndexError if the line has no ``|`` separator — callers
        are expected to pre-filter blank lines via parse_stations().
        """
        stat = station.split('|')
        return {
            'name': stat[0].strip(),
            'code': stat[1].strip()
        }

    def find_stations(self, station_name, stations):
        """Return every station dict whose name contains *station_name*
        (case-insensitive substring match)."""
        _ret = []
        for station in stations:
            if station_name.lower() in station['name'].lower():
                _ret.append(station)
        return _ret
if __name__ == '__main__':
    # Constructing Scraper also triggers the episode download as a
    # side effect of __init__.
    scraper = Scraper()
    # The original called scraper.get_stations(...), a method that did
    # not exist (AttributeError). Read the dump and parse it directly
    # with the existing parse_stations() instead.
    with open('elenco_stazioni.txt') as f:
        stations = scraper.parse_stations(f.read())
    pprint(stations)