import configparser
import os

import requests
from bs4 import BeautifulSoup
from pprint import pprint


class Scraper:
    # Leftover from an earlier experiment against the viaggiatreno API:
    # url = 'http://www.viaggiatreno.it/viaggiatrenonew/resteasy/viaggiatreno/partenze/S01480/Tue%20Oct%2011%202017%2008:30:00%20GMT+0200%20(CEST)'

    def __init__(self):
        config = configparser.RawConfigParser()
        config.read('rickandmorty.conf')
        # getfloat() raises an exception if the value is not a float;
        # getint() and getboolean() also do this for their respective types.
        self.url = config.get('Main', 'url')
        pprint(self.url)
        # config.read(['site.cfg', os.path.expanduser('~/.rickandmorty.conf')])
        self.load_page()

    def calc_res(self, resolution):
        """Turn a 'WIDTHxHEIGHT' string into a comparable pixel count."""
        if resolution is None:
            return 0
        vals = resolution.split('x')
        if len(vals) < 2:
            return 0
        pprint(vals)
        return int(vals[0]) * int(vals[1])

    def load_page(self):
        # Fill the two placeholders in the configured URL (season, episode).
        url = self.url % (1, 1)
        r = requests.get(url)
        pprint(url)
        soup = BeautifulSoup(r.text, 'html.parser')

        # The episode page embeds the player in an iframe (#player);
        # its data-src attribute points at the actual video page.
        player = soup.find(id='player')
        frame_url = player.get('data-src').strip()
        r = requests.get(frame_url)
        pprint(frame_url)
        soup = BeautifulSoup(r.text, 'html.parser')

        # Collect every <source> tag and its advertised resolution.
        sources = soup.find_all(name='source')
        video_resolutions = [x.get('res') for x in sources]
        video_sources = [x.get('src') for x in sources]
        pprint(video_sources)
        pprint(video_resolutions)

        # Pick the source with the highest pixel count.
        video_urls = list(zip(video_sources, video_resolutions))
        top_res = 0
        cur_top = video_urls[0][0]
        for src, res in video_urls:
            pixels = self.calc_res(res)
            if pixels > top_res:
                top_res = pixels
                cur_top = src
        url = cur_top

        print('downloading with requests')
        local_filename = 's01e01.mp4'
        # NOTE the stream=True parameter: the response is written to disk in
        # chunks instead of being buffered in memory all at once.
        r = requests.get(url, stream=True)
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    print('writing chunk...')
                    f.write(chunk)
                    # f.flush() commented by recommendation from J.F.Sebastian

    def get_stations(self, filename):
        """Read a pipe-delimited station list from disk and parse it."""
        with open(filename) as f:
            return self.parse_stations(f.read())

    def parse_stations(self, stations):
        """Parse one station per line, skipping blank lines."""
        return [self.parse_station(s) for s in stations.split('\n') if s]

    def parse_station(self, station):
        """Split a 'NAME|CODE' line into a station dict."""
        stat = station.split('|')
        return {
            'name': stat[0].strip(),
            'code': stat[1].strip(),
        }

    def find_stations(self, station_name, stations):
        """Case-insensitive substring search over parsed stations."""
        return [s for s in stations
                if station_name.lower() in s['name'].lower()]


if __name__ == '__main__':
    scraper = Scraper()
    stations = scraper.get_stations('elenco_stazioni.txt')
    pprint(stations)
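
# ---------------------------------------------------------------------------
# A minimal sketch of the two input files the script expects. The values
# below are hypothetical placeholders, not the real ones.
#
# rickandmorty.conf -- RawConfigParser is used above precisely because it
# performs no '%' interpolation, so the two format placeholders that
# load_page() fills with (season, episode) survive intact:
#
#   [Main]
#   url = http://example.com/rick-and-morty/season-%d/episode-%d.html
#
# elenco_stazioni.txt -- one 'NAME|CODE' pair per line, as assumed by
# parse_station():
#
#   ROMA TERMINI|S00001
#   MILANO CENTRALE|S00002
# ---------------------------------------------------------------------------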