Browse Source

Initial commit

master
valerio 5 years ago
commit
b94920e6e5
  1. 6
      .gitignore
  2. 71
      TestScraper.py
  3. 10
      fetch.sh
  4. 2
      rickandmorty.conf
  5. 108
      scraper.py
  6. 12
      status.sh

6
.gitignore

@ -0,0 +1,6 @@
*mp4
*mp4*
*.swp
.idea
*.pyc

71
TestScraper.py

@ -0,0 +1,71 @@
#!/usr/bin/env python
# TestCalculatorFunctions.py
import unittest
from scraper import Scraper
from pprint import pprint
class KnownValues(unittest.TestCase):
    """Unit tests for the pure parsing helpers of Scraper.

    NOTE(review): instantiating Scraper at class-definition time runs its
    __init__, which (per scraper.py) calls load_page() and performs live
    network I/O plus a file download. The parse_* tests below only need
    the parsing methods — consider a lighter fixture.
    """

    # Shared fixture, built once when the class is defined.
    # (Stray trailing semicolon removed.)
    scraper = Scraper()

    def test_instance(self):
        """The shared fixture really is a Scraper instance."""
        self.assertIsInstance(self.scraper, Scraper)

    def test_get_stations(self):
        """The config file exists and has content.

        The original body asserted a literal True, which could never
        fail; assert something observable about the file instead.
        """
        fname = 'rickandmorty.conf'
        with open(fname) as fopen:
            self.assertTrue(fopen.read().strip())

    def test_parse_stations(self):
        """parse_stations splits 'NAME|CODE' lines into dicts, in order."""
        source = '''BARI CENTRALE|S11119
BARI TORRE QUETTA|S11004
BOLOGNA C.LE|S05043'''
        stations = self.scraper.parse_stations(source)
        self.assertListEqual(stations, [
            {'name': 'BARI CENTRALE', 'code': 'S11119'},
            {'name': 'BARI TORRE QUETTA', 'code': 'S11004'},
            {'name': 'BOLOGNA C.LE', 'code': 'S05043'},
        ])
        # Every parsed entry exposes both expected keys.
        for station in stations:
            self.assertTrue('name' in station)
            self.assertTrue('code' in station)

    def test_parse_station(self):
        """A single 'NAME|CODE' line becomes a {'name', 'code'} dict."""
        station = 'SAN LEONARDO DI CUTRO|S11827'
        expected = {'name': 'SAN LEONARDO DI CUTRO', 'code': 'S11827'}
        self.assertDictEqual(self.scraper.parse_station(station), expected)
# Run the test suite when this module is executed directly.
if __name__ == '__main__':
    unittest.main()

10
fetch.sh

@ -0,0 +1,10 @@
#!/bin/bash
# Download Rick and Morty seasons 1-3, episodes 1-11 each, by scraping the
# episode page for its embedded estream player and extracting the .mp4 URL.
# Fixes: shebang was '#/bin/bash' (missing '!'); wget used lowercase -o,
# which writes the *log* to sNeM.mp4 and discards the video — -O (capital)
# saves the download itself to that filename.
for season in $(seq 1 3); do
    for episode in $(seq 1 11); do
        # Pipeline that digs the direct mp4 link out of the player iframe.
        as='curl -m 0 -s http://watchseries.do/series/rick-and-morty/season/'$season'/episode/'$episode' | grep player | grep estream | cut -d "\"" --fields=4 | xargs -t curl | grep mp4 | cut -d \" -f 2'
        # as='https://it.wikipedia.org/wiki/'$season''$episode
        wget $(eval $as) -O "s"$season"e"$episode".mp4"
    done
done

2
rickandmorty.conf

@ -0,0 +1,2 @@
# Configuration read by scraper.py (ConfigParser, section 'Main').
# 'url' is a template with two %s placeholders: season number, episode number.
[Main]
url=http://watchseries.do/series/rick-and-morty/season/%s/episode/%s

108
scraper.py

@ -0,0 +1,108 @@
import requests, json, ConfigParser, os
from bs4 import BeautifulSoup
from pprint import pprint
class Scraper():
    """Downloads a Rick and Morty episode from watchseries.do.

    Also carries station-parsing helpers (``NAME|CODE`` lines) that appear
    to be left over from a different scraper — NOTE(review): they are
    unrelated to the video-download path; confirm before removing.
    """

    # Example endpoint kept for reference (unused):
    # url = 'http://www.viaggiatreno.it/viaggiatrenonew/resteasy/viaggiatreno/partenze/S01480/Tue%20Oct%2011%202017%2008:30:00%20GMT+0200%20(CEST)'

    def __init__(self):
        """Read the episode URL template from rickandmorty.conf and
        immediately download an episode.

        WARNING: calling load_page() here means merely constructing a
        Scraper performs live HTTP requests and writes s01e01.mp4.
        """
        config = ConfigParser.RawConfigParser()
        config.read('rickandmorty.conf')
        # getfloat() raises an exception if the value is not a float
        # getint() and getboolean() also do this for their respective types
        self.url = config.get('Main', 'url')
        pprint(self.url)
        # config.read(['site.cfg', os.path.expanduser('~/.rickandmorty.conf')])
        self.load_page()
        pass

    def calc_res(self, resolution):
        """Return the pixel count of a 'WIDTHxHEIGHT' string.

        Returns 0 when resolution is None or not in the WxH form.
        NOTE(review): non-numeric parts (e.g. '720p') would raise
        ValueError in int() — confirm upstream values are plain digits.
        """
        if resolution is None:
            return 0
        vals = resolution.split('x')
        if (len(vals) < 2):
            return 0
        pprint(vals)
        _ret = int(vals[0]) * int(vals[1])
        return _ret

    def load_page(self):
        """Fetch episode s01e01, pick the highest-resolution source, and
        stream it to s01e01.mp4.

        Steps: (1) GET the episode page, (2) follow the #player iframe's
        data-src, (3) collect <source src/res> pairs, (4) keep the source
        whose res (via calc_res) is largest, (5) stream-download it.
        """
        # The template has two %s slots: season, episode — hard-coded to 1,1.
        url = self.url % (1, 1)
        r = requests.get(url)
        # pprint(content)
        pprint(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        player = soup.find(id="player")
        frameUrl = player.get('data-src').strip()
        r = requests.get(frameUrl)
        # pprint(content)
        pprint(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        # Parallel lists: the i-th src goes with the i-th res attribute.
        videoResolutions = [x.get('res') for x in soup.find_all(name="source")]
        videoSources = [x.get('src') for x in soup.find_all(name="source")]
        pprint(videoSources)
        pprint(videoResolutions)
        videoUrls = zip(videoSources, videoResolutions)
        # Linear scan for the (src, res) pair with the most pixels;
        # default to the first source if nothing beats res 0.
        topRes = 0
        curTop = videoUrls[0][0]
        for video in videoUrls:
            if (self.calc_res(video[1]) > topRes):
                topRes = self.calc_res(video[1])
                curTop = video[0]
        url = curTop
        # frameUrl = player.get('data-src').strip()
        print "downloading with requests"
        local_filename = "s01e01.mp4"
        # NOTE the stream=True parameter
        r = requests.get(url, stream=True)
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    print('writing chunk...\n')
                    f.write(chunk)
                    # f.flush() commented by recommendation from J.F.Sebastian
        # r = requests.get(url)
        # with open("s01e01.mp4", "wb") as code:
        #     code.write(r.content)
        # pprint(url)
        # pprint(frameUrl)
        # iframe#player

    def parse_stations(self, stations):
        """Parse a newline-separated block of 'NAME|CODE' lines into a
        list of {'name', 'code'} dicts, skipping empty lines."""
        _ret = []
        for station in stations.split('\n'):
            if len(station) > 0:
                _ret.append(self.parse_station(station))
        return _ret

    def parse_station(self, station):
        """Split one 'NAME|CODE' line into a {'name', 'code'} dict,
        trimming surrounding whitespace from both fields."""
        stat = station.split('|')
        return {
            'name': stat[0].strip(),
            'code': stat[1].strip()
        }

    def find_stations(self, station_name, stations):
        """Return the stations whose 'name' contains station_name,
        case-insensitively (substring match, not prefix)."""
        _ret = []
        for station in stations:
            if station_name.lower() in station['name'].lower():
                _ret.append(station)
        return _ret
if __name__ == '__main__':
    # Bug fix: Scraper has no get_stations() method, so the original call
    # `scraper.get_stations('elenco_stazioni.txt')` always raised
    # AttributeError. Read the station file here and feed its text to the
    # existing parse_stations() helper instead.
    scraper = Scraper()
    with open('elenco_stazioni.txt') as stations_file:
        stations = scraper.parse_stations(stations_file.read())
    pprint(stations)

12
status.sh

@ -0,0 +1,12 @@
#!/bin/sh
# Show size and name of each downloaded episode (sNeM.mp4) once per second.
# Fixes: shebang was '#/bin/sh' (missing '!'); the script looped by
# re-exec'ing itself with `exec ./status.sh`, which breaks when invoked
# from any other directory — the commented-out while loop was the right
# idea, so use it.
while true; do
    # Fields 6 and 10 of `ls -lah`: size and filename (cut emits them in
    # field order, i.e. size first).
    files="$(ls -lah | grep 's[1-9]*e[1-9]*\.mp4' | cut -d' ' -f10,6)"
    #files="$(ls -lah | awk '/s[1-9]\*e[1-9]\*\.mp4/{print $10}')"
    printf "%s\n\n" "$files"
    sleep 1
done
Loading…
Cancel
Save