找传奇、传世资源到传世资源站!

爬取头条美女图片(仅供参考思路,已不能用)

8.5玩家评分(1人评分)
下载后可评
介绍 评论 失效链接反馈

爬取头条美女图片,喜欢的可以拿走欣赏

import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

from config import *

# Shared MongoDB client; connect=False defers the actual connection until
# first use, so it is safe to create before multiprocessing fork().
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results.

    Args:
        offset: paging offset (multiples of 20).
        keyword: search keyword.

    Returns:
        The raw JSON response body on HTTP 200, otherwise None.
    """
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    # FIX: the original text had `base '?' params` (the '+' operators were
    # lost when the code was pasted), which is a SyntaxError.
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    """Download a single image URL and persist it via save_image()."""
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    """Write image bytes into the CWD, named by MD5 digest.

    The digest-based filename deduplicates repeated downloads of the
    same image.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        # `with` closes the file automatically; the original's extra
        # f.close() inside the with-block was redundant and is removed.
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Yield article URLs from the search-results JSON.

    Malformed JSON is silently skipped (best-effort scraping).
    """
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass


def get_page_detail(url):
    """Fetch an article page; return its HTML text or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    """Extract the title and gallery image URLs embedded in the page JS.

    Downloads every image found as a side effect.

    Returns:
        dict with 'title', 'url' and 'images' when gallery data is
        present, otherwise None.
    """
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # Raw string so the regex backslashes are not interpreted by Python
    # before reaching the regex engine (FIX: original was a plain string).
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        # The embedded JSON is backslash-escaped inside the JS string
        # literal; strip the escapes before parsing.
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images,
            }
    return None


def save_to_mongo(result):
    """Insert one result document; return True on success, else False."""
    # FIX: insert_one() replaces the long-deprecated pymongo insert().
    if db[MONGO_TABLE].insert_one(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    """Process one result page: fetch index, parse each article, store."""
    text = get_page_index(offset, KEYWORD)
    if text is None:
        # FIX: original passed None into json.loads via parse_page_index,
        # raising an uncaught TypeError on network failure.
        return
    for url in parse_page_index(text):
        if not url:
            continue  # some result entries carry no article_url
        html = get_page_detail(url)
        if html is None:
            continue  # FIX: original fed None into BeautifulSoup
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    # FIX: the original text had `range(GROUP_START, GROUP_END 1)` —
    # another '+' lost in the paste.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()

评论

发表评论必须先登陆, 您可以 登陆 或者 注册新账号 !


在线咨询: 问题反馈
客服QQ:174666394

有问题请留言,看到后及时答复