本篇文章給大家分享的是有關(guān)怎么在Python中使用Scrapy爬取網(wǎng)頁內(nèi)容,小編覺得挺實(shí)用的,因此分享給大家學(xué)習(xí),希望大家閱讀完這篇文章后可以有所收獲,話不多說,跟著小編一起來看看吧。
成都創(chuàng)新互聯(lián)公司主要從事成都網(wǎng)站建設(shè)、網(wǎng)站制作、網(wǎng)頁設(shè)計(jì)、企業(yè)做網(wǎng)站、公司建網(wǎng)站等業(yè)務(wù)。立足成都服務(wù)龍灣,10余年網(wǎng)站建設(shè)經(jīng)驗(yàn),價(jià)格優(yōu)惠、服務(wù)專業(yè),歡迎來電咨詢建站服務(wù):18982081108爬蟲主程序:
# -*- coding: utf-8 -*-
"""Spider scraping article title, body text and image URLs from nvsheng.com."""
import scrapy
from scrapy.http import Request
from zjf.FsmzItems import FsmzItem
from scrapy.selector import Selector


# "Quanquan": emotion/lifestyle channel
class MySpider(scrapy.Spider):
    # Spider name used by "scrapy crawl MySpider"
    name = "MySpider"
    # Restrict the crawl to this domain
    allowed_domains = ["nvsheng.com"]
    # Filled in __init__ from the command-line argument
    start_urls = []
    # Class-level flag: pagination links are only discovered on the first page
    x = 0

    def __init__(self, *args, **kwargs):
        """Accept the entry URL dynamically.

        Command-line usage:
            scrapy crawl MySpider -a start_url="http://some_url"
        """
        super(MySpider, self).__init__(*args, **kwargs)
        start_url = kwargs.get('start_url')
        # Guard against a missing -a start_url (the original produced [None],
        # which crashes the scheduler)
        self.start_urls = [start_url] if start_url else []

    def parse(self, response):
        """Extract title / body paragraphs / image URLs into an FsmzItem.

        On the first parsed page only, also schedule the remaining
        pagination pages returned by getUrl().
        """
        item = FsmzItem()
        sel = Selector(response)
        item['title'] = sel.xpath('//h2/text()').extract()
        item['text'] = sel.xpath('//*[@class="content"]/p/text()').extract()
        item['imags'] = sel.xpath(
            '//div[@id="content"]/p/a/img/@src|//div[@id="content"]/p/img/@src'
        ).extract()
        if MySpider.x == 0:
            for page_single in self.getUrl(response):
                yield Request(page_single)
        MySpider.x += 1
        yield item

    def getUrl(self, response):
        """Return absolute URLs of the pagination links, deduplicated,
        excluding the "next" link."""
        url_list = []
        select = Selector(response)
        page_list_tmp = select.xpath(
            '//div[@class="viewnewpages"]/a[not(@class="next")]/@href'
        ).extract()
        for page_tmp in page_list_tmp:
            # BUG FIX: the original compared the raw href against a list of
            # already-prefixed URLs, so the duplicate check never matched.
            full_url = "http://www.nvsheng.com/emotion/px/" + page_tmp
            if full_url not in url_list:
                url_list.append(full_url)
        return url_list
PipeLines類
# -*- coding: utf-8 -*-
# Item pipeline: accumulates scraped text and images, uploads each image,
# then publishes the assembled post when the spider closes.
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from zjf import settings
import json, os, re, random
import urllib.request
import requests, json
from requests_toolbelt.multipart.encoder import MultipartEncoder


class MyPipeline(object):
    # 1-based counter of processed items; item #1 supplies the post title
    flag = 1
    post_title = ''
    post_text = []
    post_text_imageUrl_list = []
    # Accumulated content segments ("cs") for the final post payload
    cs = []
    user_id = ''

    def __init__(self):
        # Pick a random poster identity to simulate a user-created post
        MyPipeline.user_id = MyPipeline.getRandomUser('37619,18441390,18441391')

    def process_item(self, item, spider):
        """Join the item's paragraphs, upload its images, and append one
        content segment to MyPipeline.cs. Returns the item unchanged."""
        # Body text: join paragraphs into one string (the original used a
        # quadratic += loop that also shadowed the builtin `str`)
        text_str_tmp = ''.join(item['text'])
        # The first item provides the post title
        if MyPipeline.flag == 1:
            title = item['title']
            MyPipeline.post_title = MyPipeline.post_title + title[0]
        # Save each image locally, upload it, and keep the last returned
        # hosted URL and dimensions
        text_insert_pic = ''
        text_insert_pic_w = ''
        text_insert_pic_h = ''
        for imag_url in item['imags']:
            # Build a filesystem-safe file name from the image URL
            img_name = (imag_url.replace('/', '').replace('.', '')
                        .replace('|', '').replace(':', ''))
            pic_dir = settings.IMAGES_STORE + '%s.jpg' % (img_name)
            urllib.request.urlretrieve(imag_url, pic_dir)
            # Upload returns JSON holding the hosted image URL and size
            upload_img_result = MyPipeline.uploadImage(pic_dir, 'image/jpeg')
            text_insert_pic = upload_img_result['result']['image_url']
            text_insert_pic_w = upload_img_result['result']['w']
            text_insert_pic_h = upload_img_result['result']['h']
        # The first segment carries no inline image reference
        if MyPipeline.flag == 1:
            cs_json = {"c": text_str_tmp, "i": "",
                       "w": text_insert_pic_w, "h": text_insert_pic_h}
        else:
            cs_json = {"c": text_str_tmp, "i": text_insert_pic,
                       "w": text_insert_pic_w, "h": text_insert_pic_h}
        MyPipeline.cs.append(cs_json)
        MyPipeline.flag += 1
        return item

    def open_spider(self, spider):
        # Called when the spider opens; nothing to initialise here
        pass

    def close_spider(self, spider):
        """Called when the spider closes: assemble and upload the final post."""
        strcs = json.dumps(MyPipeline.cs)
        jsonData = {
            "apisign": "99ea3eda4b45549162c4a741d58baa60",
            "user_id": MyPipeline.user_id,
            "gid": 30,
            "t": MyPipeline.post_title,
            "cs": strcs,
        }
        MyPipeline.uploadPost(jsonData)

    @staticmethod
    def uploadImage(img_path, content_type):
        """Upload one image as multipart form data; return the parsed JSON.

        `content_type` is kept for interface compatibility (the MIME type is
        currently hard-coded in the multipart field).
        """
        # UPLOAD_IMG_URL = "http://api.qa.douguo.net/robot/uploadpostimage"  # QA env
        UPLOAD_IMG_URL = "http://api.douguo.net/robot/uploadpostimage"
        m = MultipartEncoder(
            fields={
                'user_id': MyPipeline.user_id,
                'apisign': '99ea3eda4b45549162c4a741d58baa60',
                'image': ('filename', open(img_path, 'rb'), 'image/jpeg'),
            }
        )
        r = requests.post(UPLOAD_IMG_URL, data=m,
                          headers={'Content-Type': m.content_type})
        return r.json()

    @staticmethod
    def uploadPost(jsonData):
        """POST the assembled payload to the create-post endpoint."""
        # BUG FIX: the URL literal was missing its quotes in the original,
        # which is a SyntaxError.
        CREATE_POST_URL = "http://api.douguo.net/robot/uploadimagespost"
        reqPost = requests.post(CREATE_POST_URL, data=jsonData)

    @staticmethod
    def getRandomUser(userStr):
        """Return one user id chosen uniformly at random from the
        comma-separated string *userStr*."""
        # random.choice replaces the original randint(1, n) / index-1 dance
        # with identical uniform-selection semantics.
        return random.choice(str(userStr).split(','))
字段保存Items類
# -*- coding: utf-8 -*-
# Models for the scraped items.
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class FsmzItem(scrapy.Item):
    """Container for one scraped article."""
    # Article headline(s) extracted from //h2
    title = scrapy.Field()
    # Body paragraphs of the article
    text = scrapy.Field()
    # URLs of the images embedded in the article body
    imags = scrapy.Field()
在命令行里鍵入
scrapy crawl MySpider -a start_url=http://www.aaa.com
以上就是怎么在Python中使用Scrapy爬取網(wǎng)頁內(nèi)容,小編相信有部分知識(shí)點(diǎn)可能是我們?nèi)粘9ぷ鲿?huì)見到或用到的。希望你能通過這篇文章學(xué)到更多知識(shí)。更多詳情敬請(qǐng)關(guān)注創(chuàng)新互聯(lián)成都網(wǎng)站設(shè)計(jì)公司行業(yè)資訊頻道。
另外有需要云服務(wù)器可以了解下創(chuàng)新互聯(lián)scvps.cn,海內(nèi)外云服務(wù)器15元起步,三天無理由+7*72小時(shí)售后在線,公司持有idc許可證,提供“云服務(wù)器、裸金屬服務(wù)器、高防服務(wù)器、香港服務(wù)器、美國(guó)服務(wù)器、虛擬主機(jī)、免備案服務(wù)器”等云主機(jī)租用服務(wù)以及企業(yè)上云的綜合解決方案,具有“安全穩(wěn)定、簡(jiǎn)單易用、服務(wù)可用性高、性價(jià)比高”等特點(diǎn)與優(yōu)勢(shì),專為企業(yè)上云打造定制,能夠滿足用戶豐富、多元化的應(yīng)用場(chǎng)景需求。