forked from paulproteus/python-scraping-code-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathserial.py
More file actions
72 lines (57 loc) · 6.34 KB
/
serial.py
File metadata and controls
72 lines (57 loc) · 6.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import feedparser
import multiprocessing
LIST_OF_URLS = ['http://distrowatch.com/news/oggcast.xml', 'http://distrowatch.com/news/oggcast.xml', 'http://distrowatch.com/news/podcast.xml', 'http://distrowatch.com/news/podcast.xml', 'http://www.thesourceshow.org/xvid.xml', 'http://www.thesourceshow.org/xvid.xml', 'http://goinglinux.com/oggpodcast.xml', 'http://goinglinux.com/oggpodcast.xml', 'http://goinglinux.com/mp3podcast.xml', 'http://goinglinux.com/mp3podcast.xml', 'http://linuxcrazy.com/podcasts/ogg.xml', 'http://linuxcrazy.com/podcasts/ogg.xml', 'http://linuxcrazy.com/podcasts/Poderator.xml', 'http://linuxcrazy.com/podcasts/Poderator.xml', 'http://thebadapples.info/ogg.xml', 'http://thebadapples.info/ogg.xml', 'http://setbit.org/lt-ogg.xml', 'http://setbit.org/lt-ogg.xml', 'http://leoville.tv/podcasts/floss.xml', 'http://leoville.tv/podcasts/floss.xml', 'http://talkgeektome.us/ogg.xml', 'http://talkgeektome.us/ogg.xml', 'http://talkgeektome.us/mp3.xml', 'http://talkgeektome.us/mp3.xml', 'http://talkgeektome.us/flac.xml', 'http://talkgeektome.us/flac.xml', 'http://www2.madphilosopher.ca/bsdtalk_ogg.xml', 'http://www2.madphilosopher.ca/bsdtalk_ogg.xml', 'http://www.hwhq.com/rssOGG.xml', 'http://www.hwhq.com/rssOGG.xml', 'http://hwhq.com/rss.xml', 'http://hwhq.com/rss.xml', 'http://gopher.info-underground.net:70/iu/rss-iu.xml', 'http://gopher.info-underground.net:70/iu/rss-iu.xml', 'http://lottalinuxlinks.com/podcast/ogg.xml', 'http://lottalinuxlinks.com/podcast/ogg.xml', 'http://ubuntuos.com/podcast/ubuntuos-ogg.xml', 'http://ubuntuos.com/podcast/ubuntuos-ogg.xml', 'http://ubuntuos.com/podcast/ubuntuos-mp3.xml', 'http://ubuntuos.com/podcast/ubuntuos-mp3.xml', 'http://rss.ittoolbox.com/rss/security-investigator-podcast.xml', 'http://rss.ittoolbox.com/rss/security-investigator-podcast.xml', 'http://thelinuxbox.org/podcast.xml', 'http://thelinuxbox.org/podcast.xml', 'http://www.infonomicon.org/info.xml', 'http://www.infonomicon.org/info.xml', 'http://thelip.net/lipogg.xml', 'http://thelip.net/lipogg.xml', 'http://thelip.net/lipmp3.xml', 'http://thelip.net/lipmp3.xml', 'http://podcast.linuxgames.com/feeds.xml', 'http://podcast.linuxgames.com/feeds.xml', 'http://www.opennewsshow.org/ogg.xml', 'http://www.opennewsshow.org/ogg.xml', 'http://www.opennewsshow.org/mp3.xml', 'http://www.opennewsshow.org/mp3.xml', 'http://lottalinuxlinks.com/podcast/uclugogg.xml', 'http://lottalinuxlinks.com/podcast/uclugogg.xml', 'http://www.linuxworld.com/podcasts/linux/index.xml', 'http://www.linuxworld.com/podcasts/linux/index.xml', 'http://www.thebadapples.info/fedorareloaded/ogg.xml', 'http://www.thebadapples.info/fedorareloaded/ogg.xml', 'http://www.gutsygeeks.com/audio/podcast.xml.php', 'http://www.gutsygeeks.com/audio/podcast.xml.php', 'http://handheldheroes.net/rssOGG.xml', 'http://handheldheroes.net/rssOGG.xml', 'http://handheldheroes.net/rss.xml', 'http://handheldheroes.net/rss.xml', 'http://www.eff.org/rss/podcast/ogg.xml', 'http://www.eff.org/rss/podcast/ogg.xml', 'http://www.eff.org/rss/podcast/mp3.xml', 'http://www.eff.org/rss/podcast/mp3.xml', 'http://linuxcranks.info/ogg.xml', 'http://linuxcranks.info/ogg.xml', 'http://linuxvoid.technographer.net/soundfeed.xml', 'http://linuxvoid.technographer.net/soundfeed.xml', 'http://titradio.info/tit.xml', 'http://titradio.info/tit.xml', 'http://fossgeek.com/feeds/rss-ogg-full.xml', 'http://fossgeek.com/feeds/rss-ogg-full.xml', 'http://fossgeek.com/feeds/rss-mp3-full.xml', 'http://fossgeek.com/feeds/rss-mp3-full.xml', 'http://podcasts.jonmasters.org/kernel/kernel.xml', 'http://podcasts.jonmasters.org/kernel/kernel.xml', 'http://www.somethingkindatechy.org/pcg/feed.xml', 'http://www.somethingkindatechy.org/pcg/feed.xml', 'http://linuxgeekdom.com/rssogg.xml', 'http://linuxgeekdom.com/rssogg.xml', 'http://linuxgeekdom.com/rssmp3.xml', 'http://linuxgeekdom.com/rssmp3.xml', 'http://lottalinuxlinks.com/podcast/call-in.xml', 'http://lottalinuxlinks.com/podcast/call-in.xml', 'http://www.slugak.net/rss.xml', 'http://www.slugak.net/rss.xml', 'http://qskcast.info/netcasts/ogg/rss.xml', 'http://qskcast.info/netcasts/ogg/rss.xml', 'http://feeds.feedburner.com/HackRadioLive?format=xml', 'http://feeds.feedburner.com/HackRadioLive?format=xml', 'http://mikecosma.podomatic.com/rss2.xml', 'http://mikecosma.podomatic.com/rss2.xml', 'http://bsd.linuxbasix.com/feeds/regexorcist_ogg.xml', 'http://bsd.linuxbasix.com/feeds/regexorcist_ogg.xml']
def serial():
for url in LIST_OF_URLS:
parsed = feedparser.parse(url)
if parsed.entries:
print 'Found entry:', parsed.entries[0]
def parallel_with_twisted():
from twisted.internet import reactor
import twisted.internet.defer
import twisted.web.client
def handleResponse(parsed_feed):
parsed = feedparser.parse(parsed_feed)
if parsed.entries:
print 'Found entry:', parsed.entries[0]
semaphore = twisted.internet.defer.DeferredSemaphore(4)
dl = twisted.internet.defer.DeferredList([
semaphore.run(twisted.web.client.getPage, url).addBoth(handleResponse)
for url in LIST_OF_URLS])
dl.addBoth(lambda x: reactor.stop())
reactor.run()
def parallel_with_gevent():
import gevent.monkey
gevent.monkey.patch_all()
from gevent.pool import Pool
# limit ourselves to max 10 simultaneous outstanding requests
pool = Pool(10)
def handle_one_url(url):
parsed = feedparser.parse(url)
if parsed.entries:
print 'Found entry:', parsed.entries[0]
for url in LIST_OF_URLS:
pool.spawn(handle_one_url, url)
pool.join()
def parallel_with_multiprocessing():
def handle_one_url(url):
parsed = feedparser.parse(url)
if parsed.entries:
print 'Found entry:', parsed.entries[0]
pool = multiprocessing.Pool(processes=4)
result = [pool.apply_async(handle_one_url(url,))
for url in LIST_OF_URLS]
[i.get() for i in result]
def parallel_via_requests_async():
def handle_one_response(r):
parsed = feedparser.parse(r.content)
if parsed.entries:
print 'Found entry:', parsed.entries[0]
import requests.async
list_of_async_requests = [
requests.async.get(url,
hooks={'response': handle_one_response})
for url in LIST_OF_URLS]
requests.async.map(list_of_async_requests, size=5)
if __name__ == '__main__':
parallel_via_requests_async()