Note: Meetup.com has an API; use it.
pip install Scrapy
~/temp$ scrapy startproject meetup
~/temp$ find meetup/
meetup/
meetup/scrapy.cfg
meetup/meetup
meetup/meetup/settings.py
meetup/meetup/__init__.py
meetup/meetup/items.py
meetup/meetup/pipelines.py
meetup/meetup/spiders
meetup/meetup/spiders/__init__.py
from scrapy.item import Item, Field
class MeetupItem(Item):
    """Container for the data scraped about a single meetup event.

    Spiders fill the fields like dictionary keys, e.g. ``item['title']``.
    """

    # Title of the event.
    title = Field()
    # Link to the event page.
    link = Field()
    # Description text of the event.
    description = Field()
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from meetup.items import MeetupItem
class MeetupSpider(BaseSpider):
    """Spider that collects the past events listed on the group page.

    It requests the single start URL and yields one ``MeetupItem`` per
    ``li.past.line.event-item`` element found in the response.
    """

    name = "meetup"
    allowed_domains = ["meetup.com"]
    start_urls = [
        "http://www.meetup.com/Search-Meetup-Karlsruhe/"
    ]

    def parse(self, response):
        """Yield a ``MeetupItem`` for every past event in ``response``.

        ``title`` and ``link`` are stored as the lists returned by
        ``extract()``; ``description`` is not set by this spider.
        """
        page = Selector(response)
        for event in page.css('li.past.line.event-item'):
            item = MeetupItem()
            item['title'] = event.css('a.event-title::text').extract()
            # Relative XPath: only <a> elements that are direct children
            # of the event's <li> are considered.
            item['link'] = event.xpath('a/@href').extract()
            yield item
scrapy crawl meetup -o talks.json
2014-07-24 18:27:59+0200 [scrapy] INFO: Scrapy 0.20.0 started (bot: meetup)
[...]
2014-07-24 18:28:00+0200 [meetup] DEBUG: Crawled (200) <GET http://www.meetup.com/Search-Meetup-Karlsruhe/> (referer: None)
2014-07-24 18:28:00+0200 [meetup] DEBUG: Scraped from <200 http://www.meetup.com/Search-Meetup-Karlsruhe/>
{'link': [u'http://www.meetup.com/Search-Meetup-Karlsruhe/events/178746832/'],
'title': [u'Neues in Elasticsearch 1.1 und Logstash in der Praxis']}
2014-07-24 18:28:00+0200 [meetup] DEBUG: Scraped from <200 http://www.meetup.com/Search-Meetup-Karlsruhe/>
{'link': [u'http://www.meetup.com/Search-Meetup-Karlsruhe/events/161417512/'],
'title': [u'Erstes Treffen mit Kurzvortr\xe4gen']}
2014-07-24 18:28:00+0200 [meetup] INFO: Closing spider (finished)
2014-07-24 18:28:00+0200 [meetup] INFO: Stored jsonlines feed (2 items) in: talks.json
2014-07-24 18:28:00+0200 [meetup] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 244,
'downloader/request_count': 1,
[...]
'start_time': datetime.datetime(2014, 7, 24, 16, 27, 59, 540300)}
2014-07-24 18:28:00+0200 [meetup] INFO: Spider closed (finished)
less talks.json
{"link": ["http://www.meetup.com/Search-Meetup-Karlsruhe/events/178746832/"],
"title": ["Neues in Elasticsearch 1.1 und Logstash in der Praxis"]}
{"link": ["http://www.meetup.com/Search-Meetup-Karlsruhe/events/161417512/"],
"title": ["Erstes Treffen mit Kurzvortr\u00e4gen"]}
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from meetup.items import MeetupItem
class MeetupDetailSpider(CrawlSpider):
    """Crawl spider that follows event links and scrapes the detail pages.

    Starting from the group page, it follows the ``event-title`` links
    inside the ``recentMeetups`` div and hands every followed page to
    ``parse_meetup``.
    """

    name = "meetupDetail"
    allowed_domains = ["meetup.com"]
    start_urls = ["http://www.meetup.com/Search-Meetup-Karlsruhe/"]

    # A single XPath string (not a tuple) restricting where links are
    # extracted from.
    rules = [
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths='//div[@id="recentMeetups"]//a[@class="event-title"]'
            ),
            callback='parse_meetup'
        )
    ]

    def parse_meetup(self, response):
        """Yield one ``MeetupItem`` built from an event detail page.

        ``title`` and ``description`` are the lists returned by
        ``extract()``; ``link`` is the URL of the response as a plain
        string.
        """
        page = Selector(response)
        item = MeetupItem()
        item['title'] = page.xpath('//h1[@itemprop="name"]/text()').extract()
        item['link'] = response.url
        # Collects every text node below the description wrapper, so the
        # result is a list of text fragments rather than one string.
        item['description'] = page.xpath(
            '//div[@id="past-event-description-wrap"]//text()').extract()
        yield item
pip install "ScrapyElasticSearch"
# Item pipelines to activate. Since Scrapy 0.20 this setting is a dict that
# maps the pipeline class path to its order (lower values run first); the
# plain-list form is deprecated.
ITEM_PIPELINES = {
    'scrapyelasticsearch.ElasticSearchPipeline': 100,
}

# Settings read by the ScrapyElasticSearch pipeline.
ELASTICSEARCH_SERVER = 'localhost'
ELASTICSEARCH_PORT = 9200
ELASTICSEARCH_INDEX = 'meetups'
ELASTICSEARCH_TYPE = 'meetup'
# Item field named as the unique key for the pipeline.
ELASTICSEARCH_UNIQ_KEY = 'link'