找传奇、传世资源到传世资源站!

爬取头条美女图片(仅供参考思路,已不能用)

8.5玩家评分(1人评分)
下载后可评
介绍 评论 失效链接反馈

爬取头条美女图片,喜欢的可以拿走欣赏

import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

from config import *

# Shared MongoDB client; connect=False defers the actual connection until
# first use, so it is safe to create before multiprocessing fork().
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results.

    Args:
        offset: paging offset (multiples of 20).
        keyword: search keyword.

    Returns:
        The raw JSON response body on HTTP 200, otherwise None.
    """
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    # FIX: the original text had `base '?' params` (the '+' operators were
    # lost when the code was pasted), which is a SyntaxError.
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    """Download a single image URL and persist it via save_image()."""
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    """Write image bytes into the CWD, named by MD5 digest.

    The digest-based filename deduplicates repeated downloads of the
    same image.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        # `with` closes the file automatically; the original's extra
        # f.close() inside the with-block was redundant and is removed.
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Yield article URLs from the search-results JSON.

    Malformed JSON is silently skipped (best-effort scraping).
    """
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass


def get_page_detail(url):
    """Fetch an article page; return its HTML text or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    """Extract the title and gallery image URLs embedded in the page JS.

    Downloads every image found as a side effect.

    Returns:
        dict with 'title', 'url' and 'images' when gallery data is
        present, otherwise None.
    """
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # Raw string so the regex backslashes are not interpreted by Python
    # before reaching the regex engine (FIX: original was a plain string).
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        # The embedded JSON is backslash-escaped inside the JS string
        # literal; strip the escapes before parsing.
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images,
            }
    return None


def save_to_mongo(result):
    """Insert one result document; return True on success, else False."""
    # FIX: insert_one() replaces the long-deprecated pymongo insert().
    if db[MONGO_TABLE].insert_one(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    """Process one result page: fetch index, parse each article, store."""
    text = get_page_index(offset, KEYWORD)
    if text is None:
        # FIX: original passed None into json.loads via parse_page_index,
        # raising an uncaught TypeError on network failure.
        return
    for url in parse_page_index(text):
        if not url:
            continue  # some result entries carry no article_url
        html = get_page_detail(url)
        if html is None:
            continue  # FIX: original fed None into BeautifulSoup
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    # FIX: the original text had `range(GROUP_START, GROUP_END 1)` —
    # another '+' lost in the paste.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()

评论

发表评论必须先登陆, 您可以 登陆 或者 注册新账号 !


在线咨询: 问题反馈
客服QQ:174666394

有问题请留言,看到后及时答复