Comments (2)
- 在items.py文件里的CommentItem类中添加
# List of image URLs attached to a comment; the comment spider fills it
# from the comment's 'pic_infos' entries.
img_url = Field()
# Label used by the image pipeline as folder name and file-name prefix;
# the comment spider sets it to the comment's creation date.
img_name = Field()
- 在settings中添加
# Directory where downloaded images are stored; change as needed.
IMAGES_STORE = 'images/'
# Register the custom image pipeline under the value 300.
ITEM_PIPELINES = {
    'WeiboCrawler.pipelines.ImagesnamePipeline': 300,
}
- 在pipelines.py中添加
import re
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
class ImagesnamePipeline(ImagesPipeline):
    """Image pipeline that files every picture under a folder named after its comment.

    Images are stored as ``<IMAGES_STORE>/<name>/<name>_<image file name>``,
    where ``<name>`` is the item's ``img_name`` value (the comment spider
    sets it to the comment date).
    """

    def get_media_requests(self, item, info):
        """Yield one download request per URL found in ``item['img_url']``.

        Items without an ``img_url`` field produce no requests.  The item's
        ``img_name`` travels in ``meta`` so that ``file_path`` can read it.
        """
        if 'img_url' in item:
            for image_url in item['img_url']:
                # No explicit Host header: the downloader derives it from the
                # URL.  Forcing 'wx1.sinaimg.cn' sent a mismatching Host for
                # images whose URL points at any other host.
                yield Request(
                    image_url,
                    meta={'name': item['img_name']},
                    dont_filter=True,
                )

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to ``IMAGES_STORE``) for a request.

        ``item`` is accepted as a keyword-only argument because current Scrapy
        versions pass it to ``file_path``; it is not used here.
        """
        # Last URL segment, i.e. the image's own file name.
        image_guid = request.url.split('/')[-1]
        # Name handed over by get_media_requests (the comment date).
        name = request.meta['name']
        # Strip characters that are not allowed in file names.  Both the
        # ASCII double quote and the typographic quotes are removed.
        name = re.sub(r'[?\\*|"“”<>:/]', '', name)
        # Layout: <name>/<name>_<image file name>
        return u'{0}/{0}_{1}'.format(name, image_guid)
- 将comment.py修改为
# -*- coding: utf-8 -*-
import re
import json
from scrapy import Request, Spider
from WeiboCrawler.items import CommentItem
from WeiboCrawler.spiders.utils import standardize_date
class CommentSpider(Spider):
    """Spider that pages through the comments of the configured Weibo posts.

    For every root comment it yields a ``CommentItem`` carrying the comment
    date, the URLs of the attached pictures and the name the image pipeline
    should use when storing them.
    """

    name = 'comment'
    base_url = 'https://api.weibo.cn/2/comments/build_comments?'

    def _page_url(self, mblog_id, max_id=None, max_id_type=1):
        """Build the comment-API URL for one page of a post's comments.

        ``max_id`` is omitted for the first page; later pages pass the
        ``max_id`` / ``max_id_type`` values returned by the previous page.
        """
        url = (
            f'{self.base_url}is_show_bulletin=2&c=android&s=746fd605'
            f'&id={mblog_id}&from=10A8195010'
            '&gsid=_2AkMolNMzf8NhqwJRmf4dxWzgb49zzQrEieKeyCLoJRM3HRl-wT9jqmwMtRV6AgOZP3LqGBH-29qGRB4vP3j-Hng6DkBJ'
            '&count=50'
        )
        if max_id is not None:
            url += f'&max_id={max_id}'
        return f'{url}&max_id_type={max_id_type}'

    def start_requests(self):
        """Yield the first-page request for every post id in ``mblog_ids``."""
        # Fill in the ids of the posts whose comments should be crawled.
        mblog_ids = ['']
        for mblog_id in mblog_ids:
            yield Request(
                self._page_url(mblog_id),
                callback=self.parse,
                dont_filter=True,
                headers={'Host': 'api.weibo.cn'},
            )

    def parse(self, response):
        """Yield one item per root comment, then the request for the next page."""
        js = json.loads(response.text)

        # A missing or empty 'root_comments' (e.g. an error payload) simply
        # produces no items instead of raising KeyError.
        for comment in js.get('root_comments') or []:
            commentItem = CommentItem()
            commentItem['created_at'] = standardize_date(
                comment['created_at']
            ).strftime('%Y-%m-%d')
            # Original-size URL of every picture attached to the comment.
            pic_infos = comment.get('pic_infos') or {}
            commentItem['img_url'] = [
                pic['original']['url'] for pic in pic_infos.values()
            ]
            # The image pipeline names folders/files after the comment date.
            commentItem['img_name'] = commentItem['created_at']
            yield commentItem

        max_id = js.get('max_id') or 0
        if max_id > 0:
            # Read the post id from the 'id' query parameter.  The previous
            # "first 16 digits in the URL" search raised AttributeError when
            # the id was not exactly 16 digits long.
            id_match = re.search(r'[?&]id=(\d+)', response.url)
            if id_match is None:
                self.logger.warning(
                    'Cannot paginate, no post id in %s', response.url
                )
                return
            yield Request(
                self._page_url(id_match.group(1), max_id, js['max_id_type']),
                callback=self.parse,
                dont_filter=True,
                headers={'Host': 'api.weibo.cn'},
            )
from weibocrawler.
大佬威武
from weibocrawler.
Related Issues (17)
- 应该怎么打开呀 HOT 1
- 安装依赖的时候出错误应该咋办呀 HOT 1
- 在转发爬取时遇到问题 HOT 6
- 大佬,运行时报这个错是什么原因?ImportError: cannot import name 'HTTPClientFactory' from 'twisted.web.client' (unknown location) HOT 1
- 转发爬取遇到的问题 HOT 1
- comment 爬虫中存在的问题 HOT 1
- 评论爬取问题
- comment二级评论字段 HOT 1
- cookie具体怎么添加呢
- 如何增加转发者或评论者昵称? HOT 1
- 超棒的项目!可不可以再加一个微博url到mid的转化呀 HOT 2
- 求新增直接写入CSV HOT 1
- 请问为什么更换了微博id之后只能抓微博下面的一条评论啊? HOT 2
- 数据的具体时间获取问题 HOT 4
- 爬取数据的存储问题 HOT 2
- 关键词查找问题 HOT 6
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Something interesting about the web. A new door to the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Something interesting about visualization: using data as art.
-
Game
Something interesting about games; make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from weibocrawler.