# 采用 Python Scrapy 抓取安居客数据
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import log
import re
from urllib import request
from fang.items import AnJuKeItem
import json
import time
import os
class AnJuKeMobileSpider(CrawlSpider):
    """Crawl Shanghai new-development pages on m.anjuke.com.

    Starts from the paged listing URL in ``start_urls`` and follows links
    matching the patterns in ``rules``. Project main pages are handled by
    ``parse_main`` and parameter pages by ``parse_params``; each yields a
    partially filled ``AnJuKeItem``.

    NOTE(review): the indentation of this file was lost before review. The
    nesting below (all field assignments guarded by the status check) is the
    most plausible reading of the flat source -- confirm against the
    original.
    """

    name = "anjukemobile"
    allowed_domains = ['m.anjuke.com']
    start_urls = ['https://m.anjuke.com/sh/loupan/newajax/all/?q=&lat=0&lng=0&page=1']

    # Patterns are raw strings so that ``\d`` reaches the regex engine
    # untouched. ``\d+`` matches the numeric ids / page numbers, and the
    # literal ``?`` that starts the query string is escaped (unescaped it
    # would only make the preceding ``/`` optional).
    rules = (
        Rule(LinkExtractor(allow=r'/loupan/newajax/all/\?q=&lat=0&lng=0&page=\d+'), follow=True),
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/$'), callback="parse_main", follow=True),
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/params/'), callback="parse_params"),
        # NOTE(review): ``parse_image`` is not defined in the part of the
        # file visible here -- confirm it exists elsewhere.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/xiangce/\d+/$'), callback="parse_image"),
    )

    def parse_main(self, response):
        """Extract status, project name and address from a project main page.

        Yields one ``AnJuKeItem`` when a status string is present and does
        not contain "已售罄". Yields nothing when the status is missing or
        contains that marker, or when extraction fails; a failure is logged
        with its traceback instead of being raised.
        """
        self.logger.info('down load url %s', response.url)
        base = '//*[@id="container"]/div[@class="lpbase"]'
        try:
            status = response.xpath(
                base + '/div[@class="lptitle"]/span/em/text()').extract()
            if not status or "已售罄" in status[0]:
                return

            item = AnJuKeItem()
            item['status'] = status[0]

            projects = response.xpath(
                base + '/div[@class="lptitle"]/span/h1/text()').extract()
            item['project_name'] = projects[0]

            address = response.xpath(
                base + '/div[@class="lpinfo"]/a/p[@class="g-overflow-third"]//text()').extract()
            # Strip surrounding whitespace and non-breaking spaces once and
            # reuse the result for both the address and the district.
            cleaned_address = address[0].strip().replace('\xa0', '')
            item['address'] = cleaned_address
            item['province'] = '上海市'
            item['city'] = '上海市'
            # The district is the text before the first '-' of the address.
            item['district'] = cleaned_address.split('-')[0]
        except Exception:
            # Best-effort scraping: an unexpected page layout (for example
            # an empty XPath result) must not abort the crawl.
            self.logger.exception('failed to parse main page %s', response.url)
            return

        # Hand the item to Scrapy so it reaches the item pipelines.
        yield item

    def parse_params(self, response):
        """Extract status, name, delivery time, price and renovation info.

        Yields one ``AnJuKeItem`` when a status string is present and does
        not contain "已售罄". ``renovation`` is set to ``None`` when the
        expected "装修标准" label is not found in its list entry. Yields
        nothing when the status check fails or when extraction fails; a
        failure is logged with its traceback instead of being raised.
        """
        self.logger.info('down load param %s', response.url)
        base = '//*[@id="main-page"]/div[@class="pcontainer"]'
        try:
            status = response.xpath(base + '/ul/li[2]/span/text()').extract()
            if not status or "已售罄" in status[0]:
                return

            item = AnJuKeItem()
            item['status'] = status[0]
            item['project_name'] = response.xpath(
                base + '/h3[1]/text()').extract()[0]
            item['delivery_time'] = response.xpath(
                base + '/ul/li[4]/span/text()').extract()[0]
            item['unit_price'] = response.xpath(
                base + '/ul[2]/li[1]/span/text()').extract()[0]

            # Only read the value when the label confirms that this list
            # entry is the renovation-standard row.
            renovation_label = response.xpath(
                base + '/ul[3]/li[3]/label/text()').extract()[0]
            if "装修标准" in renovation_label:
                item['renovation'] = response.xpath(
                    base + '/ul[3]/li[3]/span/text()').extract()[0]
            else:
                item['renovation'] = None
        except Exception:
            # Best-effort scraping: log the URL together with the traceback
            # so the failing page can be inspected later.
            self.logger.exception('failed to parse params page %s', response.url)
            return

        # Hand the item to Scrapy so it reaches the item pipelines.
        yield item
# 评论