scrapy抓取安居客数据

资源分类：Python/Python语言基础
发布人：房东的猫
文件大小：41943
文件格式：.zip
浏览次数：20
下载次数： 0
发布时间：9月5日

开通会员每日领积分！

8.5玩家评分(1人评分)

下载后可评

介绍评论  失效链接反馈

采用 Python 的 Scrapy 框架抓取安居客数据

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import log
import re
from urllib import request
from fang.items import AnJuKeItem
import json
import time
import os

class AnJuKeMobileSpider(CrawlSpider):
    """Crawl new-build listings from the Anjuke mobile site (Shanghai section).

    The crawl starts at the paginated AJAX listing endpoint and follows
    links to listing detail pages, their ``params`` pages and their photo
    album (``xiangce``) pages.  ``parse_main`` and ``parse_params`` each
    yield an ``AnJuKeItem`` holding the fields available on that page type;
    listings whose status text contains "已售罄" (sold out) are skipped.
    """

    name = "anjukemobile"
    allowed_domains = ['m.anjuke.com']
    start_urls = ['https://m.anjuke.com/sh/loupan/newajax/all/?q=&lat=0&lng=0&page=1']

    # The ``allow`` values are regular expressions, so they are raw strings,
    # the literal ``?`` of the query string is escaped, and numeric path
    # segments use ``\d+`` (one or more digits).
    rules = (
        Rule(LinkExtractor(allow=r'/loupan/newajax/all/\?q=&lat=0&lng=0&page=\d+'), follow=True),
        # Disabled in the original source:
        # Rule(LinkExtractor(allow=r'/sh/loupan/p\d+'), follow=True),
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/$'), callback="parse_main", follow=True),
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/params/'), callback="parse_params"),
        # NOTE(review): ``parse_image`` is not defined in the visible part of
        # this class -- confirm it exists elsewhere or add it.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/xiangce/\d+/$'), callback="parse_image"),
    )

    def parse_main(self, response):
        """Parse a listing detail page and yield its basic information.

        Args:
            response: The Scrapy response for a ``/sh/loupan/<id>/`` page.

        Yields:
            An ``AnJuKeItem`` with ``status``, ``project_name``, ``address``,
            ``province``, ``city`` and ``district`` set.  Nothing is yielded
            when the status is missing, the listing is sold out, or the page
            cannot be parsed (the failure is logged with its traceback).
        """
        self.logger.info('down load url %s', response.url)
        try:
            item = self._extract_main(response)
        except Exception:
            # Best effort: one malformed page must not abort the crawl.
            self.logger.exception('failed to parse listing page %s', response.url)
            return
        if item is not None:
            self.logger.debug('parsed listing item: %r', item)
            yield item

    def parse_params(self, response):
        """Parse a listing ``params`` page and yield its detailed parameters.

        Args:
            response: The Scrapy response for a ``/sh/loupan/<id>/params/`` page.

        Yields:
            An ``AnJuKeItem`` with ``status``, ``project_name``,
            ``delivery_time``, ``unit_price`` and ``renovation`` set.  Nothing
            is yielded when the status is missing, the listing is sold out,
            or the page cannot be parsed (the failure is logged with its
            traceback).
        """
        self.logger.info('down load param %s', response.url)
        try:
            item = self._extract_params(response)
        except Exception:
            # Best effort: one malformed page must not abort the crawl.
            self.logger.exception('failed to parse params page %s', response.url)
            return
        if item is not None:
            self.logger.debug('parsed params item: %r', item)
            yield item

    def _extract_main(self, response):
        """Build the item for a detail page, or return None if it is skipped.

        Raises ``IndexError`` when an expected node is absent from the page.
        """
        base = '//*[@id="container"]/div[@class="lpbase"]'
        status = response.xpath(base + '/div[@class="lptitle"]/span/em/text()').extract()
        if not status or "已售罄" in status[0]:
            return None

        projects = response.xpath(base + '/div[@class="lptitle"]/span/h1/text()').extract()
        address_nodes = response.xpath(
            base + '/div[@class="lpinfo"]/a/p[@class="g-overflow-third"]//text()').extract()
        # Drop surrounding whitespace and non-breaking spaces once, then reuse.
        address = address_nodes[0].strip().replace('\xa0', '')

        item = AnJuKeItem()
        item['status'] = status[0]
        item['project_name'] = projects[0]
        item['address'] = address
        # The spider only crawls the /sh/ section, so province and city are fixed.
        item['province'] = '上海市'
        item['city'] = '上海市'
        # The district is the text before the first '-' of the address.
        item['district'] = address.split('-')[0]
        return item

    def _extract_params(self, response):
        """Build the item for a params page, or return None if it is skipped.

        Raises ``IndexError`` when an expected node is absent from the page.
        """
        base = '//*[@id="main-page"]/div[@class="pcontainer"]'
        status = response.xpath(base + '/ul/li[2]/span/text()').extract()
        if not status or "已售罄" in status[0]:
            return None

        item = AnJuKeItem()
        item['status'] = status[0]
        item['project_name'] = response.xpath(base + '/h3[1]/text()').extract()[0]
        item['delivery_time'] = response.xpath(base + '/ul/li[4]/span/text()').extract()[0]
        item['unit_price'] = response.xpath(base + '/ul[2]/li[1]/span/text()').extract()[0]

        # The third row of the third list is only used when its label says
        # "装修标准" (renovation standard); otherwise the field is left empty.
        renovation_label = response.xpath(base + '/ul[3]/li[3]/label/text()').extract()[0]
        if "装修标准" in renovation_label:
            item['renovation'] = response.xpath(base + '/ul[3]/li[3]/span/text()').extract()[0]
        else:
            item['renovation'] = None
        return item

下载“scrapy抓取安居客数据”的用户还喜欢

发表评论必须先登录，您可以登录或者注册新账号！

scrapy抓取安居客数据

评论

作者专栏

编辑推荐