#!/usr/bin/env python
# TestScraper.py

import unittest
from scraper import Scraper


class KnownValues(unittest.TestCase):
    """Unit tests for Scraper's station-parsing helpers."""

    # NOTE(review): Scraper.__init__ reads rickandmorty.conf and performs
    # network I/O via load_page.  The original created the instance as a
    # class attribute (with a stray semicolon), which ran all of that at
    # import time; setUpClass defers it until the test class actually runs
    # and still shares one instance across tests.
    @classmethod
    def setUpClass(cls):
        cls.scraper = Scraper()

    def test_instance(self):
        """The shared fixture really is a Scraper."""
        self.assertIsInstance(self.scraper, Scraper)

    def test_get_stations(self):
        """The config file exists and is non-empty.

        The original asserted True unconditionally inside the `with`
        block, which only proved that open() did not raise.
        """
        fname = 'rickandmorty.conf'
        with open(fname) as fopen:
            self.assertTrue(fopen.read().strip())

    def test_parse_stations(self):
        """parse_stations splits newline-separated 'NAME|CODE' records."""
        source = '''BARI CENTRALE|S11119
BARI TORRE QUETTA|S11004
BOLOGNA C.LE|S05043'''
        stations = self.scraper.parse_stations(source)
        self.assertListEqual(stations, [
            {'name': 'BARI CENTRALE', 'code': 'S11119'},
            {'name': 'BARI TORRE QUETTA', 'code': 'S11004'},
            {'name': 'BOLOGNA C.LE', 'code': 'S05043'},
        ])
        for station in stations:
            self.assertIn('name', station)
            self.assertIn('code', station)

    def test_parse_station(self):
        """A single 'NAME|CODE' record parses into name/code keys."""
        station = 'SAN LEONARDO DI CUTRO|S11827'
        expected = {'name': 'SAN LEONARDO DI CUTRO', 'code': 'S11827'}
        self.assertDictEqual(self.scraper.parse_station(station), expected)
#!/bin/bash
# fetch.sh - download every Rick and Morty episode (seasons 1-3, up to 11
# episodes each) by scraping the estream player URL out of each episode page.
#
# Fixes over the original:
#  * shebang was '#/bin/bash' (missing '!'), so the kernel never saw it
#  * wget's lowercase -o writes the *log* to the file; -O (capital) is the
#    output-document flag, so the original saved a log instead of the video
#  * variables are quoted; the eval'd pipeline string is now a plain pipeline

for season in $(seq 1 3); do
  for episode in $(seq 1 11); do
    # Resolve the direct .mp4 URL: episode page -> estream player frame -> source
    url=$(curl -m 0 -s "http://watchseries.do/series/rick-and-morty/season/${season}/episode/${episode}" \
        | grep player | grep estream | cut -d '"' --fields=4 \
        | xargs -t curl | grep mp4 | cut -d '"' -f 2)

    wget "$url" -O "s${season}e${episode}.mp4"
  done
done
def calc_res(self, resolution):
    """Return the pixel area (width * height) of a 'WxH' resolution string.

    Returns 0 for None, malformed strings, or non-numeric components so the
    result can always be used as a comparison key.  The original raised
    ValueError on labels like '720p' and emitted debug pprint output.
    """
    if resolution is None:
        return 0
    parts = resolution.split('x')
    if len(parts) < 2:
        return 0
    try:
        return int(parts[0]) * int(parts[1])
    except ValueError:
        # e.g. '720p' or any other non-'WxH' label
        return 0


def load_page(self):
    """Download episode s01e01: resolve the player iframe, pick the
    highest-resolution <source>, and stream the video to disk.

    NOTE(review): performs network I/O; season/episode are hard-coded to
    (1, 1) via self.url.  Fixes over the original: zip() is materialised
    with list() so indexing works on Python 3, an empty <source> list no
    longer raises a bare IndexError, and print is called as a function
    (valid on both Python 2 and 3).
    """
    url = self.url % (1, 1)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    player = soup.find(id="player")
    frame_url = player.get('data-src').strip()

    r = requests.get(frame_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    sources = soup.find_all(name="source")
    # (src, res) pairs; list() so it is indexable/reusable on Python 3 too
    video_urls = list(zip((s.get('src') for s in sources),
                          (s.get('res') for s in sources)))
    if not video_urls:
        raise RuntimeError('no <source> elements found in player frame')

    # Highest pixel area wins; max() keeps the first on ties, matching the
    # original strict-greater loop.
    best_url, _ = max(video_urls, key=lambda pair: self.calc_res(pair[1]))

    print("downloading with requests")
    local_filename = "s01e01.mp4"
    # stream=True so the whole video is never held in memory at once
    r = requests.get(best_url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)


def parse_stations(self, stations):
    """Parse newline-separated 'NAME|CODE' records into a list of dicts,
    skipping empty lines."""
    return [self.parse_station(line)
            for line in stations.split('\n') if len(line) > 0]


def parse_station(self, station):
    """Parse one 'NAME|CODE' record into {'name': ..., 'code': ...},
    stripping surrounding whitespace from both fields."""
    name, code = station.split('|')[:2]
    return {
        'name': name.strip(),
        'code': code.strip()
    }


def get_stations(self, fname):
    """Read *fname* and parse its contents with parse_stations.

    Added because the __main__ demo (and the test-suite) call
    scraper.get_stations(...) but the original module never defined it.
    """
    with open(fname) as fh:
        return self.parse_stations(fh.read())


def find_stations(self, station_name, stations):
    """Return the stations whose name contains *station_name*
    (case-insensitive substring match)."""
    needle = station_name.lower()
    return [station for station in stations
            if needle in station['name'].lower()]
#!/bin/sh
# status.sh - print the size/name of every downloaded sNeM.mp4 once per
# second.  Re-runs itself via exec (no process pile-up) instead of the
# commented-out while loop.
#
# Fix over the original: shebang was '#/bin/sh' (missing '!').
# NOTE(review): parsing `ls -lah` with grep/cut is fragile - the field
# numbers (-f10,6 here) depend on the ls output format; confirm on the
# target system or switch to `du -h s*e*.mp4`.

files="$(ls -lah | grep 's[1-9]*e[1-9]*\.mp4' | cut -d' ' -f10,6)"
printf "%s\n\n" "$files"
sleep 1
# exec replaces this process with a fresh copy, looping forever without
# growing a call stack or leaving zombie shells behind
exec ./status.sh