以下是爬取京東商品詳情的Python3代碼,以excel存放鏈接的方式批量爬取。excel如下
10多年的神池網站建設經驗,針對設計、前端、開發、售后、文案、推廣等六對一服務,響應快,48小時及時工作處理。全網營銷推廣的優勢是能夠根據用戶設備顯示端的尺寸不同,自動調整神池建站的顯示方式,使網站能夠適用不同顯示終端,在瀏覽器中調整網站的寬度,無論在任何一種瀏覽器上瀏覽網站,都能展現優雅布局與設計,從而大程度地提升瀏覽體驗。創新互聯從事“神池網站設計”,“神池網站推廣”以來,每個客戶項目都認真落實執行。
代碼如下
import calendar
import codecs
import datetime
import logging
import os
import time
from logging import handlers

import openpyxl
import pymssql
import requests
import xlrd
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Headers sent with every image request (value kept exactly as published).
_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/ Firefox/50.0'}
# Prefix joined with each thumbnail's "data-url" attribute to build the
# head-image URL.
_HEAD_IMAGE_BASE = 'https://img14.360buyimg.com/n0/'
# Fixed pause after navigation; see the comment in EgongYePing.Init.
_PAGE_SETTLE_SECONDS = 3
# Upper bound for a single image request so a stalled server cannot hang
# the whole batch.
_REQUEST_TIMEOUT_SECONDS = 30


def _image_extension(url):
    """Return the extension used for the saved file.

    This is the second dot-separated piece of the last path segment of
    *url* (the text after the first ``//``), e.g. ``jpg`` for
    ``https://host/a/b/name.jpg``.

    Raises:
        IndexError: if *url* has no ``//`` or the file name has no dot.
    """
    file_name = url.split('//')[1].split('/')[-1]
    return file_name.split('.')[1]


def _save_image(url, stem):
    """Download *url* and store it as ``<cwd>/img/<stem>.<extension>``.

    Nothing is written unless the server answers with status 200.  Any
    failure (network error, timeout, unusable URL, file-system error) is
    reported on stdout and swallowed so that one bad image does not stop
    the batch.
    """
    try:
        # The context manager releases the connection on every path.
        with requests.get(url, headers=_HEADERS, stream=True,
                          timeout=_REQUEST_TIMEOUT_SECONDS) as response:
            if response.status_code != 200:
                return
            target_dir = os.path.join(os.getcwd(), 'img')
            # The output directory may not exist on a fresh checkout.
            os.makedirs(target_dir, exist_ok=True)
            target = os.path.join(target_dir, stem + '.' + _image_extension(url))
            with open(target, 'wb') as image_file:
                image_file.write(response.content)
    except Exception:
        # Deliberate best-effort boundary: report and carry on.
        print("圖片禁止訪問:" + url)


class EgongYePing:
    """Download the head images and detail images of JD product pages.

    A single Firefox instance is started when the class body is executed
    and is published as the module-level name ``driver``.
    """

    options = webdriver.FirefoxOptions()
    fp = webdriver.FirefoxProfile()
    fp.set_preference("browser.download.folderList", 2)
    fp.set_preference("browser.download.manager.showWhenStarting", False)
    fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/zip,application/octet-stream")
    # Attach the profile through the options object; the ``firefox_profile``
    # keyword of ``webdriver.Firefox`` is not accepted by current Selenium.
    options.profile = fp
    global driver
    driver = webdriver.Firefox(options=options)

    def Init(self, url, code):
        """Open one product page and save all images found on it.

        Args:
            url: Product page address; surrounding whitespace is stripped.
            code: Product code used as the prefix of every saved file name.
        """
        page_url = url.strip()
        print(page_url)
        driver.get(page_url)
        # Original author's rationale: browser automation is asynchronous,
        # and on a slow network the script may run before the page has
        # answered, so wait a fixed time before reading the page source.
        time.sleep(_PAGE_SETTLE_SECONDS)
        html = etree.HTML(driver.page_source)
        if html is not None and driver.title is not None:
            self._download_head_images(html, code)
            self._download_detail_images(html, code)
        print('結束執行')

    @staticmethod
    def _download_head_images(html, code):
        """Save every thumbnail listed under the "spec-list" element."""
        thumbnails = html.xpath('//*[contains(@class,"spec-list")]//ul//li//img')
        for position, thumbnail in enumerate(thumbnails):
            data_url = thumbnail.attrib.get("data-url")
            if data_url is None:
                # Without the attribute there is no URL to build; skip it
                # instead of aborting the whole run.
                continue
            image_url = _HEAD_IMAGE_BASE + data_url
            print('頭圖下載:' + image_url)
            # The first thumbnail is the main image, the rest are extras.
            label = "_主圖_" if position == 0 else "_附圖_"
            _save_image(image_url, code + label + str(position))

    @staticmethod
    def _download_detail_images(html, code):
        """Save the images of the product-detail section.

        Three XPath queries are tried in order and the first non-empty
        result is used.  A node may yield an image through its ``data-id``
        attribute (CSS background of the matching "animate-<id>" element),
        through its ``src`` attribute, or both; both variants are saved
        under the same file name, so the ``src`` one wins when both exist.
        """
        nodes = html.xpath('//*[contains(@class,"ssd-module")]')
        if not nodes:
            nodes = html.xpath('//*[contains(@id,"J-detail-content")]//div//div//p//img')
        if not nodes:
            nodes = html.xpath('//*[contains(@id,"J-detail-content")]//img')

        for index, node in enumerate(nodes):
            attributes = node.attrib
            stem = code + "_詳情圖_" + str(index)

            if 'data-id' in attributes:
                try:
                    element = driver.find_element(
                        By.CLASS_NAME, "animate-" + attributes['data-id'])
                    background = element.value_of_css_property('background-image')
                    # Turn the CSS value url("...") into a bare address.
                    image_url = (background.replace('url(', ' ')
                                 .replace(')', ' ')
                                 .replace('"', ' ')
                                 .strip())
                except Exception:
                    # Best effort: the element lookup failed, skip this node.
                    print('其他格式的圖片不收錄')
                else:
                    print("詳情圖下載:" + image_url)
                    _save_image(image_url, stem)

            if 'src' in attributes:
                image_url = attributes['src']
                if 'http' not in image_url:
                    # Scheme-less address: prepend the scheme.
                    image_url = 'https:' + image_url
                print("詳情圖下載:" + image_url)
                _save_image(image_url, stem)

    @staticmethod
    def readxlsx(inputText):
        """Process every product link found in the first sheet of a workbook.

        The first row is skipped.  In every following row, each non-empty
        cell is taken as a product code and the cell to its right as the
        link; links containing ``item.jd.com`` but not ``i-item.jd.com``
        are handed to :meth:`Init`, everything else is reported as
        unsupported.

        Args:
            inputText: Path of the ``.xlsx`` file to read.
        """
        workbook = openpyxl.load_workbook(inputText)
        worksheet = workbook[workbook.sheetnames[0]]
        last_row = worksheet.max_row
        last_column = worksheet.max_column
        for row in range(2, last_row + 1):
            for column in range(1, last_column):
                code = worksheet.cell(row=row, column=column).value
                if code is None:
                    continue
                link = str(worksheet.cell(row=row, column=column + 1).value)
                if 'item.jd.com' in link and 'i-item.jd.com' not in link:
                    print('支持:' + str(code) + '|' + link)
                    EgongYePing().Init(link, str(code))
                else:
                    print('當前格式不支持:' + (str(code) + '|' + link))


if __name__ == "__main__":
    start = EgongYePing()
    try:
        start.readxlsx(r'C:\Users\newYear\Desktop\爬圖.xlsx')
    finally:
        # Close the browser even when the batch stops on an error.
        driver.quit()