本文實(shí)例講述了python實(shí)現(xiàn)的爬取電影下載鏈接功能。分享給大家供大家參考,具體如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape movie download links from www.ygdy8.net and append them to a text file.

Ported from Python 2 to Python 3: urllib2 -> urllib.request, print statement
-> print(), and the reload(sys)/sys.setdefaultencoding("utf-8") hack removed
(unnecessary on Python 3, where str is Unicode).  The unused os/chardet
imports and discarded soup.prettify() calls were also dropped.
"""
import urllib.request

from bs4 import BeautifulSoup


def get_movie_download_url(html):
    """Extract the video download link from a movie's detail page.

    :param html: decoded HTML text of one movie detail page
    :return: the link text of the <a> inside the WORD-WRAP-styled <td>
    """
    soup = BeautifulSoup(html, 'html.parser')
    # The download link is the <a> inside the only <td> carrying this
    # inline style on the detail page.
    td = soup.find('td', attrs={'style': 'WORD-WRAP: break-word'})
    return td.find('a').string


def get_movie_title(html):
    """Extract the movie title (the page's <h2> heading) from a detail page."""
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find('h2').string


def get_html(url):
    """Fetch *url* and return the raw response body as bytes.

    NOTE: the original code built a Request and added the User-Agent header,
    but then called urlopen(url) on the bare URL, so the header was never
    sent.  Fixed here by opening the Request object itself.
    """
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # 'with' closes the HTTP response even if read() raises.
    with urllib.request.urlopen(req) as response:
        return response.read()


def get_movie_list(url):
    """Return absolute detail-page URLs for every movie on a listing page.

    :param url: URL of one listing page
    :return: list of "http://www.ygdy8.net" + relative href strings
    """
    host = "http://www.ygdy8.net"
    soup = BeautifulSoup(get_html(url), 'html.parser')
    # Each movie entry is an <a class="ulink"> with a site-relative href.
    return [host + a.get('href')
            for a in soup.find_all('a', attrs={'class': 'ulink'})]


def file_edit(wr_str):
    """Append *wr_str* to the output text file.

    Uses a context manager so the handle is closed even when the write fails.
    """
    with open(r'e:\down_load_url.txt', 'a', encoding='utf-8') as f:
        f.write(wr_str)


def write_to_txt(a_urls):
    """Fetch each movie detail page and append its title + link to the file.

    :param a_urls: iterable of movie detail-page URLs
    """
    for a_url in a_urls:
        # The site serves GBK-encoded pages; ignore undecodable bytes so a
        # single bad page does not abort the whole run.
        html = get_html(a_url).decode('GBK', errors='ignore')
        file_edit(get_movie_title(html) + "\n")
        file_edit(get_movie_download_url(html) + "\n")
        file_edit("\n")


def get_pages_url(num):
    """Return the listing-page URLs for pages 1..num (inclusive).

    :param num: how many listing pages to scrape
    :return: list of listing-page URL strings
    """
    base = "http://www.ygdy8.net/html/gndy/dyzz/list_23_"
    return [base + str(n) + ".html" for n in range(1, num + 1)]


if __name__ == '__main__':
    pages = 2  # number of listing pages to scrape
    for page_url in get_pages_url(pages):
        write_to_txt(get_movie_list(page_url))  # fetch and write one page
    print("done")