import re
import time

import pymysql
import requests
import xlwt
from lxml import etree
# HTTP headers passed to every requests.get() call in this module: a desktop
# Chrome User-Agent string plus a Cookie header.
# NOTE(review): the Cookie value looks like a captured browser session and
# contains spaces around '=' and '-'; confirm it is still valid and needed.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
'Cookie': 'gr_user_id = c6f58a39 - ea25 - 4f58 - b448 - 545070192c4e;59a81cc7d8c04307ba183d331c373ef6_gr_session_id = e8e4b66f - 440a - 4ae7 - a76a - fe2dd2b34a26;59a81cc7d8c04307ba183d331c373ef6_gr_last_sent_sid_with_cs1 = e8e4b66f - 440a - 4ae7 - a76a - fe2dd2b34a26;59a81cc7d8c04307ba183d331c373ef6_gr_last_sent_cs1 = N % 2FA;59a81cc7d8c04307ba183d331c373ef6_gr_session_id_e8e4b66f - 440a - 4ae7 - a76a - fe2dd2b34a26 = true;grwng_uid = 9ec14ad9 - 5ac0 - 4bb1 - 81c1 - bc60d2685710;abtest_ABTest4SearchDate = b;xzuuid = 79426b52;_uab_collina = 154660443606130958890473;TY_SESSION_ID = 907f32df - c060 - 49ca - b945 - 98215cc03475;rule_math = pvzq3r06hi'}
# Open the MySQL connection to the local `doubanmovie` database. This runs at
# import time, so importing the module already requires a reachable server.
# NOTE(review): the credentials are hard-coded in source; consider reading
# them from configuration or the environment instead.
conn = pymysql.connect(host= 'localhost',user= 'root',passwd='momiao5201314',db='doubanmovie',port=3306,charset='utf8')
cursor = conn.cursor() # create the cursor object used for the INSERT statements
# Disabled Excel export: everything between the triple quotes below is a bare
# string literal and is never executed. The text inside would create an xlwt
# workbook with one sheet and write a header row into it.
# NOTE(review): if this is ever re-enabled, the loop body needs indenting,
# 'actors,style' looks like two column names fused into one string (7 headers
# for the 8 scraped fields), and write() is called on the workbook rather than
# on the worksheet created just above - verify against the xlwt API.
'''
# 創(chuàng)建一個(gè)workbook設(shè)置編碼
workbook = xlwt.Workbook(encoding = 'utf-8')
# 創(chuàng)建一個(gè)worksheet
worksheet = workbook.add_sheet('My Worksheet')
#定義表頭
header = ['movie_name','director','actors,style','country','release_time','time','score']
for h in range(len(header)):
workbook.write(0,h,header[h])
'''
def get_movie_url(url):
    """Fetch one listing page and scrape every movie it links to.

    Downloads ``url`` with the module-level ``headers``, collects the
    ``href`` of each ``<a>`` that is a direct child of a ``<div class="hd">``
    element, and passes each link to ``get_movie_info``.

    Args:
        url: Address of the listing page to download.

    Raises:
        requests.exceptions.RequestException: If the download fails or the
            server does not respond within the timeout.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection and the whole crawl would hang on a single page.
    html = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(html.text)
    for movie_url in selector.xpath('//div[@class="hd"]/a/@href'):
        get_movie_info(movie_url)
def get_movie_info(url):
    """Scrape one movie detail page and insert its fields into ``doubanmovie``.

    Downloads ``url`` with the module-level ``headers``, extracts title,
    director, cast, genre, production country, release date, running time
    and score, prints the title, and runs a parameterized INSERT on the
    module-level ``cursor``. Nothing is committed here; the caller commits.

    A page that has no cast element is reported on stdout and skipped.

    NOTE(review): the regular expressions below were reconstructed. In the
    previous revision the HTML tags inside the patterns had been stripped
    (leaving ``'(.*?)'``, which only ever matches empty strings, and one
    string literal broken across two lines) and the Chinese labels were
    garbled. Verify the patterns and XPath selectors against the live page
    markup before relying on the stored data.

    Args:
        url: Address of the movie detail page.

    Raises:
        requests.exceptions.RequestException: If the download fails or the
            server does not respond within the timeout.
    """
    # Explicit timeout so one stalled connection cannot hang the crawl.
    html = requests.get(url, headers=headers, timeout=10)
    page_text = html.text
    selector = etree.HTML(page_text)

    # The cast element is the only lookup that is indexed directly, so check
    # for it up front instead of swallowing an IndexError afterwards.
    cast_nodes = selector.xpath('//*[@id="info"]/span[3]/span[2]')
    if not cast_nodes:
        print('skipped {}: cast element not found'.format(url))
        return

    movie_name = selector.xpath('//*[@id="content"]/h2/span[1]/text()')  # 1 title
    director = selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')  # 2 director
    actor = cast_nodes[0].xpath('string(.)')  # 3 cast, flattened to plain text

    # 4 genre: the first two genres concatenated, as before. Slicing keeps
    # movies that list a single genre instead of dropping them.
    genres = re.findall(r'<span property="v:genre">(.*?)</span>', page_text, re.S)
    style = ''.join(genres[:2])

    # 5 production country/region
    country = re.findall(r'制片国家/地区:</span>(.*?)<br\s*/?>', page_text, re.S)
    # 6 release date
    release_time = re.findall(r'上映日期:</span>.*?>(.*?)</span>', page_text, re.S)
    # 7 running time; named `runtime` so the `time` module is not shadowed.
    runtime = re.findall(r'片长:</span>.*?>(.*?)</span>', page_text, re.S)
    # 8 score
    score = selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')

    print(str(movie_name))
    # Most values are lists; str() stores their repr (e.g. "['x']"), which is
    # kept unchanged so the format of the stored columns does not change.
    cursor.execute(
        "insert into doubanmovie(name,director,actor,style,country,release_time,time,score) values(%s,%s,%s,%s,%s,%s,%s,%s)",
        (str(movie_name), str(director), str(actor), str(style),
         str(country), str(release_time), str(runtime), str(score)),
    )
if __name__ == '__main__':
    # Listing pages of the Top 250 chart, addressed by start offset
    # 0, 25, ..., 225.
    urls = ['https://movie.douban.com/top250?start={}&filter='.format(num)
            for num in range(0, 250, 25)]
    try:
        for url in urls:
            get_movie_url(url)
            # Commit page by page so rows already inserted survive a failure
            # on a later page.
            conn.commit()
            time.sleep(2)  # pause between listing pages
    finally:
        # Release the database resources even if a request or insert fails.
        cursor.close()
        conn.close()
# 十余年的桂林网站建设经验,针对设计、前端、开发、售后、文案、推广等六对一服务,响应快,48小时及时工作处理。成都全网营销推广的优势是能够根据用户设备显示端的尺寸不同,自动调整桂林建站的显示方式,使网站能够适用不同显示终端,在浏览器中调整网站的宽度,无论在任何一种浏览器上浏览网站,都能展现优雅布局与设计,从而大程度地提升浏览体验。成都创新互联公司从事“桂林网站设计”,“桂林网站推广”以来,每个客户项目都认真落实执行。