Initial commit
commit b94920e6e5
6 .gitignore vendored Normal file
@@ -0,0 +1,6 @@
*mp4
*mp4*
*.swp

.idea
*.pyc
71 TestScraper.py Executable file
@@ -0,0 +1,71 @@
#!/usr/bin/env python
# TestScraper.py

import unittest
from scraper import Scraper
from pprint import pprint


class KnownValues(unittest.TestCase):
    scraper = Scraper()

    def test_instance(self):
        self.assertIsInstance(self.scraper, Scraper)

    def test_get_stations(self):
        # placeholder: only verifies that the config file can be opened
        fname = 'rickandmorty.conf'
        with open(fname):
            self.assertTrue(True)

    def test_parse_stations(self):
        source = '''BARI CENTRALE|S11119
BARI TORRE QUETTA|S11004
BOLOGNA C.LE|S05043'''
        stations = self.scraper.parse_stations(source)
        self.assertListEqual(stations, [
            {'name': 'BARI CENTRALE', 'code': 'S11119'},
            {'name': 'BARI TORRE QUETTA', 'code': 'S11004'},
            {'name': 'BOLOGNA C.LE', 'code': 'S05043'},
        ])
        for station in stations:
            self.assertTrue('name' in station)
            self.assertTrue('code' in station)

    def test_parse_station(self):
        station = 'SAN LEONARDO DI CUTRO|S11827'
        expected = {'name': 'SAN LEONARDO DI CUTRO', 'code': 'S11827'}
        self.assertDictEqual(self.scraper.parse_station(station), expected)

    # def test_can_connect(self):
    #     scraper = Scraper()
    #     self.assertEqual(scraper.touch('http://ddg.gg'), 200)
    #
    # def test_get_page(self):
    #     scraper = Scraper()
    #     self.assertEqual(scraper.get_page().status_code, 200)
    #
    # def test_format_hackerspace(self):
    #     scraper = Scraper()
    #     hackerspace = {'name': 'pippo'}
    #     formatted = scraper.format_hackerspace(hackerspace)
    #     self.assertTrue('url' in formatted)
    #
    # def test_get_hackerspaces(self):
    #     scraper = Scraper()
    #     hackerspaces = scraper.get_hackerspaces()
    #     self.assertGreater(len(hackerspaces), 0)
    #
    #     for hackerspace in hackerspaces:
    #         self.assertTrue('url' in hackerspace)
    #
    # def test_convert_text_field_to_hs_url(self):
    #     scraper = Scraper()
    #     textfield = '<b><a href="/Freaknet" title="Freaknet">Freaknet</a></b>'
    #     self.assertEqual(scraper.convert_text_field_to_hs_url(textfield), 'https://wiki.hackerspaces.org/Freaknet')


if __name__ == '__main__':
    unittest.main()
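test_get_stations above is a placeholder; a fuller version, assuming the hypothetical Scraper.get_stations(fname) helper sketched after scraper.py below and an elenco_stazioni.txt list of NAME|CODE pairs, might look like:

    # hypothetical: depends on Scraper.get_stations(), which this commit does not define
    def test_get_stations(self):
        stations = self.scraper.get_stations('elenco_stazioni.txt')
        self.assertGreater(len(stations), 0)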
10 fetch.sh Executable file
@@ -0,0 +1,10 @@
#!/bin/bash

for season in $(seq 1 3); do for episode in $(seq 1 11)
do

# scrape the episode page for the estream player URL, fetch that embed, and grep the mp4 link out of it
as='curl -m 0 -s http://watchseries.do/series/rick-and-morty/season/'$season'/episode/'$episode' | grep player | grep estream | cut -d "\"" --fields=4 | xargs -t curl | grep mp4 | cut -d \" -f 2'
# as='https://it.wikipedia.org/wiki/'$season''$episode
wget $(eval $as) -O "s"$season"e"$episode".mp4"  # -O saves the download itself; lowercase -o would only redirect wget's log

done;done
2 rickandmorty.conf Normal file
@@ -0,0 +1,2 @@
[Main]
url=http://watchseries.do/series/rick-and-morty/season/%s/episode/%s
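scraper.py fills the two %s placeholders with season and episode numbers via old-style string formatting; a minimal sketch of that lookup, mirroring Scraper.__init__ and load_page below:

    import ConfigParser
    config = ConfigParser.RawConfigParser()
    config.read('rickandmorty.conf')
    url = config.get('Main', 'url') % (1, 1)
    # -> http://watchseries.do/series/rick-and-morty/season/1/episode/1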
108 scraper.py Normal file
@@ -0,0 +1,108 @@
import requests, json, ConfigParser, os
from bs4 import BeautifulSoup
from pprint import pprint


class Scraper():
    # url = 'http://www.viaggiatreno.it/viaggiatrenonew/resteasy/viaggiatreno/partenze/S01480/Tue%20Oct%2011%202017%2008:30:00%20GMT+0200%20(CEST)'

    def __init__(self):
        config = ConfigParser.RawConfigParser()
        config.read('rickandmorty.conf')

        # getfloat() raises an exception if the value is not a float
        # getint() and getboolean() also do this for their respective types
        self.url = config.get('Main', 'url')

        pprint(self.url)

        # config.read(['site.cfg', os.path.expanduser('~/.rickandmorty.conf')])
        self.load_page()

    def calc_res(self, resolution):
        # score a 'WIDTHxHEIGHT' string by its pixel area; None or malformed input scores 0
        if resolution is None:
            return 0
        vals = resolution.split('x')
        if len(vals) < 2:
            return 0
        pprint(vals)
        return int(vals[0]) * int(vals[1])

    def load_page(self):
        url = self.url % (1, 1)
        r = requests.get(url)
        # pprint(content)
        pprint(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        player = soup.find(id="player")
        frameUrl = player.get('data-src').strip()

        r = requests.get(frameUrl)
        # pprint(content)
        pprint(frameUrl)
        soup = BeautifulSoup(r.text, 'html.parser')
        videoResolutions = [x.get('res') for x in soup.find_all(name="source")]
        videoSources = [x.get('src') for x in soup.find_all(name="source")]
        pprint(videoSources)
        pprint(videoResolutions)

        # keep the source with the largest pixel area, defaulting to the first one
        videoUrls = zip(videoSources, videoResolutions)
        topRes = 0
        curTop = videoUrls[0][0]
        for video in videoUrls:
            if self.calc_res(video[1]) > topRes:
                topRes = self.calc_res(video[1])
                curTop = video[0]
        url = curTop
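        # equivalent, more compact selection (a sketch, not in the original commit):
        # url = max(videoUrls, key=lambda v: self.calc_res(v[1]))[0]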

        # frameUrl = player.get('data-src').strip()

        print "downloading with requests"
        local_filename = "s01e01.mp4"
        # NOTE the stream=True parameter
        r = requests.get(url, stream=True)
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    print('writing chunk...\n')
                    f.write(chunk)
                    # f.flush() commented by recommendation from J.F.Sebastian
        # r = requests.get(url)
        # with open("s01e01.mp4", "wb") as code:
        #     code.write(r.content)
        # pprint(url)
        # pprint(frameUrl)
        # iframe#player

    def parse_stations(self, stations):
        # split a newline-separated list of NAME|CODE pairs into dicts
        _ret = []
        for station in stations.split('\n'):
            if len(station) > 0:
                _ret.append(self.parse_station(station))
        return _ret

    def parse_station(self, station):
        stat = station.split('|')
        return {
            'name': stat[0].strip(),
            'code': stat[1].strip()
        }

    def find_stations(self, station_name, stations):
        # case-insensitive substring match over parsed station dicts
        _ret = []
        for station in stations:
            if station_name.lower() in station['name'].lower():
                _ret.append(station)
        return _ret


if __name__ == '__main__':
    scraper = Scraper()

    # NOTE: get_stations is not defined on Scraper in this commit (see sketch below)
    stations = scraper.get_stations('elenco_stazioni.txt')
    pprint(stations)
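get_stations is never defined on Scraper in this commit (it appears to be left over from an earlier train-timetable scraper, along with the viaggiatreno URL and the station helpers), so the __main__ block raises AttributeError as written. A minimal sketch of the missing method, assuming elenco_stazioni.txt holds one NAME|CODE pair per line:

    # hypothetical helper, not part of this commit: read the station list
    # and delegate to parse_stations()
    def get_stations(self, fname):
        with open(fname) as f:
            return self.parse_stations(f.read())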