Initial commit
commit b94920e6e5

.gitignore (vendored, new file)
@@ -0,0 +1,6 @@
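# downloaded episodes, vim swap files, PyCharm metadata, and Python bytecode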
*mp4
*mp4*
*.swp

.idea
*.pyc

TestScraper.py (new executable file)
@@ -0,0 +1,71 @@
#!/usr/bin/env python3
# TestScraper.py

import unittest
from scraper import Scraper
from pprint import pprint


class KnownValues(unittest.TestCase):
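    # NOTE: creating this Scraper runs load_page() in its constructor, so the
    # test suite hits the network (and downloads an episode) on import.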
    scraper = Scraper()

    def test_instance(self):
        self.assertIsInstance(self.scraper, Scraper)

    def test_get_stations(self):
        # Only checks that the config file exists and can be opened.
        fname = 'rickandmorty.conf'
        with open(fname):
            self.assertTrue(True)

    def test_parse_stations(self):
        source = '''BARI CENTRALE|S11119
BARI TORRE QUETTA|S11004
BOLOGNA C.LE|S05043'''
        stations = self.scraper.parse_stations(source)
        self.assertListEqual(stations, [
            {'name': 'BARI CENTRALE', 'code': 'S11119'},
            {'name': 'BARI TORRE QUETTA', 'code': 'S11004'},
            {'name': 'BOLOGNA C.LE', 'code': 'S05043'},
        ])
        for station in stations:
            self.assertTrue('name' in station)
            self.assertTrue('code' in station)

    def test_parse_station(self):
        station = 'SAN LEONARDO DI CUTRO|S11827'
        expected = {'name': 'SAN LEONARDO DI CUTRO', 'code': 'S11827'}
        self.assertDictEqual(self.scraper.parse_station(station), expected)

    # Leftover tests from an earlier hackerspaces.org scraper, kept for reference:
    #
    # def test_can_connect(self):
    #     scraper = Scraper()
    #     self.assertEqual(scraper.touch('http://ddg.gg'), 200)
    #
    # def test_get_page(self):
    #     scraper = Scraper()
    #     self.assertEqual(scraper.get_page().status_code, 200)
    #
    # def test_format_hackerspace(self):
    #     scraper = Scraper()
    #     hackerspace = {'name': 'pippo'}
    #     formatted = scraper.format_hackerspace(hackerspace)
    #     self.assertTrue('url' in formatted)
    #
    # def test_get_hackerspaces(self):
    #     scraper = Scraper()
    #     hackerspaces = scraper.get_hackerspaces()
    #     self.assertGreater(len(hackerspaces), 0)
    #
    #     for hackerspace in hackerspaces:
    #         self.assertTrue('url' in hackerspace)
    #
    # def test_convert_text_field_to_hs_url(self):
    #     scraper = Scraper()
    #     textfield = '<b><a href="/Freaknet" title="Freaknet">Freaknet</a></b>'
    #     self.assertEqual(scraper.convert_text_field_to_hs_url(textfield), 'https://wiki.hackerspaces.org/Freaknet')


if __name__ == '__main__':
    unittest.main()

fetch.sh (new executable file)
@@ -0,0 +1,10 @@
#!/bin/bash

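# For each season/episode pair, scrape the watchseries page for the estream
# player frame, fetch that frame, and cut the direct .mp4 URL out of its HTML.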
for season in $(seq 1 3); do
    for episode in $(seq 1 11); do
        as='curl -m 0 -s http://watchseries.do/series/rick-and-morty/season/'$season'/episode/'$episode' | grep player | grep estream | cut -d "\"" --fields=4 | xargs -t curl | grep mp4 | cut -d \" -f 2'
        # as='https://it.wikipedia.org/wiki/'$season''$episode
        wget $(eval $as) -O "s${season}e${episode}.mp4"
    done
done

rickandmorty.conf (new file)
@@ -0,0 +1,2 @@
[Main]
url=http://watchseries.do/series/rick-and-morty/season/%s/episode/%s
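# The two %s placeholders are filled with the season and episode numbers by
# scraper.py (self.url % (1, 1) in load_page()).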

scraper.py (new file)
@@ -0,0 +1,108 @@
import configparser
import os

import requests
from bs4 import BeautifulSoup
from pprint import pprint


class Scraper:
    # url = 'http://www.viaggiatreno.it/viaggiatrenonew/resteasy/viaggiatreno/partenze/S01480/Tue%20Oct%2011%202017%2008:30:00%20GMT+0200%20(CEST)'

    def __init__(self):
        config = configparser.RawConfigParser()
        config.read('rickandmorty.conf')

        # getfloat() raises an exception if the value is not a float
        # getint() and getboolean() also do this for their respective types
        self.url = config.get('Main', 'url')

        pprint(self.url)

        # config.read(['site.cfg', os.path.expanduser('~/.rickandmorty.conf')])
        # NOTE: this downloads an episode as a side effect of construction.
        self.load_page()

    def calc_res(self, resolution):
        # Turn a 'WIDTHxHEIGHT' string into a pixel count for comparison.
        if resolution is None:
            return 0
        vals = resolution.split('x')
        if len(vals) < 2:
            return 0
        pprint(vals)
        _ret = int(vals[0]) * int(vals[1])
        return _ret

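    # load_page() drives the whole scrape: fetch the episode page, follow the
    # embedded player frame, pick the best <source>, and stream it to disk.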
    def load_page(self):
        url = self.url % (1, 1)
        r = requests.get(url)
        # pprint(content)
        pprint(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        player = soup.find(id="player")
        frameUrl = player.get('data-src').strip()

        r = requests.get(frameUrl)
        # pprint(content)
        pprint(frameUrl)
        soup = BeautifulSoup(r.text, 'html.parser')
        videoResolutions = [x.get('res') for x in soup.find_all(name="source")]
        videoSources = [x.get('src') for x in soup.find_all(name="source")]
        pprint(videoSources)
        pprint(videoResolutions)

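        # Pair each source URL with its resolution and keep the one with the
        # highest pixel count (see calc_res); zip() is materialized to a list
        # so it can be indexed below under Python 3.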
        videoUrls = list(zip(videoSources, videoResolutions))
        topRes = 0
        curTop = videoUrls[0][0]
        for video in videoUrls:
            if self.calc_res(video[1]) > topRes:
                topRes = self.calc_res(video[1])
                curTop = video[0]
        url = curTop

        # frameUrl = player.get('data-src').strip()

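        # Stream the chosen source to disk in 1 KiB chunks rather than holding
        # the whole episode in memory.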
        print("downloading with requests")
        local_filename = "s01e01.mp4"
        # NOTE the stream=True parameter
        r = requests.get(url, stream=True)
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    print('writing chunk...\n')
                    f.write(chunk)
                    # f.flush() commented by recommendation from J.F.Sebastian

        # r = requests.get(url)
        # with open("s01e01.mp4", "wb") as code:
        #     code.write(r.content)
        # pprint(url)
        # pprint(frameUrl)
        # iframe#player

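    # The helpers below parse 'NAME|CODE' station records (cf. the viaggiatreno
    # URL commented at the top of the class and elenco_stazioni.txt below).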
    def parse_stations(self, stations):
        _ret = []
        for station in stations.split('\n'):
            if len(station) > 0:
                _ret.append(self.parse_station(station))
        return _ret

    def parse_station(self, station):
        stat = station.split('|')
        return {
            'name': stat[0].strip(),
            'code': stat[1].strip()
        }

    def find_stations(self, station_name, stations):
        _ret = []
        for station in stations:
            if station_name.lower() in station['name'].lower():
                _ret.append(station)
        return _ret

if __name__ == '__main__':
    scraper = Scraper()

    # Scraper defines no get_stations(); read the station list from disk and
    # feed it through the existing parser instead.
    with open('elenco_stazioni.txt') as f:
        stations = scraper.parse_stations(f.read())
    pprint(stations)