Git Product home page Git Product logo

douban's Introduction

douban

python抓取豆瓣电影Top250数据并写入Excel

1.将目标网站上的页面抓取下来

2.将抓取下来的数据根据一定的规则进行提取

create table `tb_movie`(
    id int(11) primary key auto_increment,
    title varchar(255),
    score varchar(255),
    date varchar(255),
    region varchar(255),
    category varchar(255),
    directorAndActor varchar(255),
    quote varchar(255),
    thumbnail varchar(255)
)ENGINE=InnoDB DEFAULT CHARSET=utf8;
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : nacker [email protected]
# @Time    : 2020/3/4 6:25 下午
# @Site    : 
# @File    : douban.py
# @Software: PyCharm

# 1.将目标网站上的页面抓取下来
# 2.将抓取下来的数据根据一定的规则进行提取
import time
import requests
import xlwt
from lxml import etree
from MysqlHelper import MysqlHelper

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Referer': 'https://www.douban.com/',
}

def get_page_source(url):
    '''
    获取网页源代码
    '''
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            text = response.text
            return etree.HTML(text)
    except requests.ConnectionError:
        return None

def get_movie_info(html):
    '''
    获取其中一页的数据
    '''
    if len(html):
        movies = []
        # item节点
        items = html.xpath("//div[@class='item']")
        for index,item in items:
            # 1.标题
            title = item.xpath("..//@alt")[0]
            # print(title)
            # 2.分数 rating_num
            score = item.xpath('..//span[@class="rating_num"]/text()')[0]
            # print(score)
            # 3.描述
            duration = ""
            # 4.时间+地区+类别
            tempString = item.xpath('div[@class="bd"]/p[@class=""]/text()')[1]
            tempString = "".join(tempString.split())
            temps = tempString.split('/')
            # print(temps)
            date = temps[0]
            # print(year)
            region = temps[1]
            # print(region)
            category = temps[2]
            # 5.导演+演员
            directorAndActorStr = item.xpath('div[@class="bd"]/p[@class=""]/text()')[0]
            directorAndActorStr = "".join(directorAndActorStr.split())
            # print(directorAndActorStr)
            directorAndActor = directorAndActorStr
            # 6.图片
            thumbnail = item.xpath("..//@src")[0]
            # print(thumbnail)
            # 7.quote
            quote = item.xpath('..//p[@class="quote"]/span/text()')
            if quote:
                quote = quote[0]
            else:
                quote = ""
            # print(quote)
            movie = {
                'title': title,
                'score': score,
                'date' : date,
                'region': region,
                'category': category,
                'directorAndActor': directorAndActor,
                'quote' : quote,
                'thumbnail': thumbnail
            }
            movies.append(movie)
        return movies

def write_movie(list):
    '''
    写入数据到数据库
    '''
    for dict in list:
        sql = 'insert into tb_movie(title,score,date,region,category,directorAndActor,quote,thumbnail) values(%s,%s,%s,%s,%s,%s,%s,%s)'
        mysqlHelper = MysqlHelper('localhost', 3306, 'douban', 'root', '123456')
        params = [dict['title'],dict['score'],dict['date'],dict['region'],dict['category'],dict['directorAndActor'],dict['quote'],dict['thumbnail']]
        # print(params)
        count = mysqlHelper.insert(sql, params)
        if count == 1:
            print(dict)
            print('--------------------ok--------------------')
        else:
            print('--------------------error--------------------')

def save_excel(datalist):
    '''
    将相关数据写入excel中
    '''
    # 初始化Excel
    w = xlwt.Workbook()
    style = xlwt.XFStyle()  # 初始化样式
    font = xlwt.Font()  # 为样式创建字体
    font.name = u"微软雅黑"
    style.font = font  # 为样式设置字体
    ws = w.add_sheet(u"豆瓣电影Top250", cell_overwrite_ok=True)

    # 将 title 作为 Excel 的列名
    title = u"排行, 电影, 评分, 年份, 地区, 类别, 导演主演, 评价, 图片"
    title = title.split(",")
    for i in range(len(title)):
        ws.write(0, i, title[i], style)
    #  # 开始写入数据库查询到的数据
    for i in range(len(datalist)):
        row = datalist[i]
        for j in range(len(row)):
            if row[j]:
                item = row[j]
                ws.write(i + 1, j, item, style)

    # 写文件完成,开始保存xls文件
    path = '豆瓣电影Top250.xls'
    w.save(path)

def start_robot():
    '''
    开始爬虫
    '''
    # https: // movie.douban.com / top250?start = 0 & filter =
    for offset in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start=' + str(offset) +'&filter='
        item = get_page_source(url)
        # 获取单页数据
        list = get_movie_info(item)
        # 写数据到数据库
        write_movie(list)
        # 家里网络不好,休眠一会吧,省的被K了
        time.sleep(3)
    else:
        print('豆瓣top250的电影信息写入完毕')

def read_mysql_Data():
    sql = 'select * from tb_movie order by id asc'
    mysqlHelper = MysqlHelper('localhost', 3306, 'douban', 'root', '123456')
    datalist = mysqlHelper.get_all(sql)
    return datalist

def main():
    # 保存到数据库
    # start_robot()
    # 读取数据
    datalist = read_mysql_Data()
    # 写入Excel
    save_excel(datalist)

if __name__ == '__main__':
    main()

github 源码下载

douban's People

Contributors

nacker avatar

Stargazers

 avatar  avatar

Watchers

 avatar

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.