這篇文章給大家分享的是有關python爬蟲如何獲取淘寶天貓商品詳細參數的內容。小編覺得挺實用的，因此分享給大家做個參考，一起跟隨小編過來看看吧。
創新互聯是一家專業的網站制作公司，提供的服務包括：
品牌網站建設、網站設計、網頁設計。我們是一家專業網站建設公司，做網站，我們是認真的。我們是成都網站制作、成都網站建設的先行者，一切以客戶的利益為設計方向，能夠為不同行業的客戶提供全面、長期、深入的網絡解決方案。創新互聯根據客戶的具體需求，提供從策劃、創意、制作、執行等服務。
首先我是從淘寶進去，爬取了按銷量排序的所有（100頁）女裝的列表信息。按綜合、銷量分別爬取淘寶女裝列表信息，然後導出前100商品的 link，爬取其詳細信息。這些商品有淘寶的，也有天貓的，這兩個平台有些區別，處理的時候要注意。比如，有的說「面料」、有的說「材質成分」，其實是一個意思，等等。可以取不同的鏈接做一下測試。
import re
from collections import OrderedDict
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq #獲取整個(gè)網(wǎng)頁(yè)的源代碼
from config import * #可引用congif的所有變量
import pymysql
import urllib
import json
import bs4
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from pyquery import PyQuery as pq #獲取整個(gè)網(wǎng)頁(yè)的源代碼
import pandas as pd
# Test run: Taobao + Tmall — full output and CSV saving verified.
# Module-level Selenium session shared by every scraping function below;
# launching Firefox is a side effect of running this script.
browser = webdriver.Firefox()
wait = WebDriverWait(browser,10)  # explicit-wait helper, 10 s timeout (defined but not used below)
####### Tmall: upper (header) section of a product page #############
def get_tianmao_header(url):
    """Open a Tmall item page in the shared Selenium browser and return an
    OrderedDict with title, promo price, list price, monthly sales and
    cumulative review count.  Keys are the Chinese column names expected
    by get_Details()/main().
    """
    browser.get(url)
    page = pq(browser.page_source)
    root = page('#page')  # everything of interest lives under #page
    info = OrderedDict()
    info['寶貝'] = root.find('.tb-detail-hd').find('h2').text()
    info['促銷(xiāo)價(jià)'] = root.find('#J_PromoPrice').find('.tm-promo-price').find('.tm-price').text()
    info['原價(jià)'] = root.find('#J_StrPriceModBox').find('.tm-price').text()
    # The sales-counter text carries trailing words; keep only the leading number.
    sales_text = root.find('.tm-ind-panel').find('.tm-indcon').find('.tm-count').text()
    info['月銷(xiāo)量'] = sales_text.split(' ', 2)[0]
    info['累計(jì)評(píng)價(jià)'] = root.find('#J_ItemRates').find('.tm-indcon').find('.tm-count').text()
    return info
######## Taobao: upper (header) section of a product page ###############
def get_taobao_header(url):
    """Open a Taobao item page in the shared Selenium browser and return an
    OrderedDict with title, list price, promo price, monthly sales and
    cumulative review count.  Keys are the Chinese column names expected
    by get_Details()/main().
    """
    browser.get(url)
    page = pq(browser.page_source)
    root = page('#page')  # everything of interest lives under #page
    info = OrderedDict()
    info['寶貝'] = root.find('#J_Title').find('h4').text()
    info['原價(jià)'] = root.find('#J_StrPrice').find('.tb-rmb-num').text()
    info['促銷(xiāo)價(jià)'] = root.find('#J_PromoPriceNum').text()
    info['月銷(xiāo)量'] = root.find('#J_SellCounter').text()
    info['累計(jì)評(píng)價(jià)'] = root.find('#J_RateCounter').text()
    return info
####################### 詳情 ############################
# 抓取所有商品詳情
# 抓取所有商品詳情 — scrape all attribute details of one product
def get_Details(attrs, info):
    """Parse a product's attribute list and merge the values into *info*.

    attrs: list of elements (BeautifulSoup Tags from '.attributes-list li'
           or '#J_AttrUL li') whose .text looks like '<name>: <value>',
           e.g. '厚薄: 薄' or '材質(zhì)成分: 其他100%'.
    info:  OrderedDict already holding the header fields (from
           get_taobao_header / get_tianmao_header); detail fields are
           added to it in place.
    Returns the same *info* dict.

    Fixes vs. the original version:
    - the illustrative attrs example was left un-commented (SyntaxError);
    - '主要顏色' was unconditionally overwritten with 'NA' whenever
      '顏色分類(lèi)' was absent, losing an already-extracted color;
    - the regex was recompiled twice per attribute and raised
      AttributeError on an entry without a colon (now skipped);
    - the 26 copy-pasted if/elif/else ladders are replaced by one
      data-driven table (Taobao and Tmall sometimes use different names
      for the same attribute, hence the candidate lists).
    """
    # '(.*?):[\s]*(.*)' — non-greedy name up to the first colon, optional
    # whitespace, then the value.  Compile once, outside the loop.
    pattern = re.compile(r'(.*?):[\s]*(.*)')
    allattrs = OrderedDict()  # attribute name -> value for this product page
    for attr in attrs:
        m = pattern.search(attr.text)
        if m:  # skip malformed entries instead of crashing on None.group()
            allattrs[m.group(1)] = m.group(2)
    # (output key, candidate source names in priority order).
    # The first candidate present wins; a missing attribute yields 'NA'.
    spec = [
        ('材質(zhì)成分', ('材質(zhì)成分', '面料')),          # Taobao says 面料, Tmall 材質成分
        ('流行元素', ('流行元素',)),
        ('年份季節(jié)', ('年份季節(jié)',)),
        ('袖長(zhǎng)', ('袖長(zhǎng)',)),
        ('銷(xiāo)售渠道類(lèi)型', ('銷(xiāo)售渠道類(lèi)型',)),
        ('貨號(hào)', ('貨號(hào)',)),
        ('服裝版型', ('服裝版型',)),
        ('衣長(zhǎng)', ('衣長(zhǎng)',)),
        ('領(lǐng)型', ('領(lǐng)型',)),
        ('袖型', ('袖型',)),
        ('品牌', ('品牌',)),
        ('圖案', ('圖案', '中老年女裝圖案')),
        ('服裝款式細(xì)節(jié)', ('服裝款式細(xì)節(jié)',)),
        ('適用年齡', ('適用年齡',)),
        ('風(fēng)格', ('風(fēng)格', '中老年風(fēng)格')),
        ('通勤', ('通勤',)),
        ('裙長(zhǎng)', ('裙長(zhǎng)',)),
        ('裙型', ('裙型',)),
        ('腰型', ('腰型',)),
        # bug fix: '顏色分類(lèi)' is preferred, but '主要顏色' is no longer
        # clobbered with 'NA' when '顏色分類(lèi)' is missing.
        ('主要顏色', ('顏色分類(lèi)', '主要顏色')),
        ('尺碼', ('尺碼',)),
        ('組合形式', ('組合形式',)),
        ('褲長(zhǎng)', ('褲長(zhǎng)',)),
    ]
    for out_key, candidates in spec:
        for name in candidates:
            if name in allattrs:
                info[out_key] = allattrs[name]
                break
        else:
            info[out_key] = 'NA'
    return info
import csv
def main():
    """Read the first 100 product links from women_clothes_sales2.csv,
    scrape each item's header info and attribute details, and write one
    row per product to clothes_detai.csv.

    Requires the module-level Selenium `browser` plus network access;
    Taobao and Tmall pages need different scrapers and CSS selectors.
    """
    # NOTE(review): 'clothes_detai.csv' looks like a typo for
    # 'clothes_detail.csv' — confirm before renaming, other tooling may
    # already depend on the current name.
    with open('clothes_detai.csv', 'w', newline='', encoding='utf-8') as csvfile:
        # English output columns; they correspond 1:1 to the Chinese keys
        # produced by get_*_header() and get_Details() (see writerow below).
        fieldnames=[ 'Link','Brand','Title','Price','Sale price','Sales','Evaluations',
                    'Component', 'Fashion elements','Sleeve','Seasons','Sales channels',
                    'Number','Clothes_Style','Long','Collar type','Sleeve type',
                    'Skirt type','Skirt length','Waist','Combining form','Outseam',
                    'Design','Fashion pattern detail','Applicable age',
                    'Style','Commuter','color','Size']
        writer = csv.DictWriter(csvfile, fieldnames = fieldnames)
        writer.writeheader()
        f = pd.read_csv('women_clothes_sales2.csv')
        # Protocol-relative links, e.g. '//item.taobao.com/item.htm?...#detail'
        urls = f['link'][0:100]
        keys, values = [], []  # NOTE(review): unused — candidates for removal
        for i in urls:
            url = 'http:' + i  # links in the CSV lack the scheme
            # Taobao links in the source data end with '#detail'; Tmall
            # links do not — choose the matching scraper and selector.
            if url.endswith('detail'):
                info = get_taobao_header(url)
                res = requests.get(url)
                soup = BeautifulSoup(res.text, "html.parser")
                attrs = soup.select('.attributes-list li')  # Taobao: class selector
            else:
                info = get_tianmao_header(url)
                res = requests.get(url)
                soup = BeautifulSoup(res.text, "html.parser")
                attrs = soup.select('#J_AttrUL li')  # Tmall: id selector
            d = get_Details(attrs,info)
            print(d)
            # Map the Chinese keys of d onto the English CSV columns.
            writer.writerow({'Link':url,'Brand':d['品牌'],'Title':d['寶貝'], 'Price':d['原價(jià)'], 'Sale price':d['促銷(xiāo)價(jià)'], 'Sales':d['月銷(xiāo)量'], 'Evaluations':d['累計(jì)評(jià)價(jià)'],
                             'Component':d['材質(zhì)成分'], 'Fashion elements':d['流行元素'], 'Sleeve':d['袖長(zhǎng)'], 'Seasons':d['年份季節(jié)'], 'Sales channels':d['銷(xiāo)售渠道類(lèi)型'],
                             'Number':d['貨號(hào)'],'Clothes_Style':d['服裝版型'],'Long':d['衣長(zhǎng)'],'Collar type':d['領(lǐng)型'], 'Sleeve type':d['袖型'],
                             'Skirt type':d['裙型'], 'Skirt length':d['裙長(zhǎng)'], 'Waist':d['腰型'], 'Combining form':d['組合形式'], 'Outseam':d['褲長(zhǎng)'],
                             'Design':d['圖案'], 'Fashion pattern detail':d['服裝款式細(xì)節(jié)'], 'Applicable age':d['適用年齡'],
                             'Style':d['風(fēng)格'], 'Commuter':d['通勤'], 'color':d['主要顏色'], 'Size':d['尺碼']})
if __name__=='__main__':
    main()
python的數據類型有哪些？python的數據類型：1. 數字類型，包括int（整型）、long（長整型）和float（浮點型）。2. 字符串，分別是str類型和unicode類型。3. 布爾型，Python布爾類型也是用於邏輯運算，有兩個值：True（真）和False（假）。4. 列表，列表是Python中使用最頻繁的數據類型，集合中可以放任何數據類型。5. 元組，元組用"()"標識，內部元素用逗號隔開。6. 字典，字典是一種鍵值對的集合。7. 集合，集合是一個無序的、不重複的數據組合。
感謝各位的閱讀！關於「python爬蟲如何獲取淘寶天貓商品詳細參數」這篇文章就分享到這裡了，希望以上內容可以對大家有一定的幫助，讓大家可以學到更多知識，如果覺得文章不錯，可以把它分享出去讓更多的人看到吧！
文章標題：python爬蟲如何獲取淘寶天貓商品詳細參數-創新互聯
分享地址：http://weahome.cn/article/epiio.html