本篇文章給大家分享的是有關怎么在python中使用xpath爬取網上數據,小編覺得挺實用的,因此分享給大家學習,希望大家閱讀完這篇文章后可以有所收獲,話不多說,跟著小編一起來看看吧。
發展壯大離不開廣大客戶長期以來的信賴與支持,我們將始終秉承“誠信為本、服務至上”的服務理念,堅持“二合一”的優良服務模式,真誠服務每家企業,認真做好每個細節,不斷完善自我,成就企業,實現共贏。行業涉及石雕等,在網站建設、營銷型網站建設、WAP手機網站、VI設計、軟件開發等項目上具有豐富的設計經驗。
1. 設計數據庫
# Django models for the scraped product catalogue: categories, tags, products
# and the per-product advantage / body text rows.
import os
import uuid

from django.db import models
from uuslug import slugify


def _unique_upload_name(filename):
    """Return a random 8-hex-digit file name that keeps the text after the last dot of *filename*."""
    ext = filename.split('.')[-1]
    return '{}.{}'.format(uuid.uuid4().hex[:8], ext)


def products_directory_path(instance, filename):
    """Build the upload path ``images/products/<product title>/<random name>`` for a product image.

    Used as the ``upload_to`` callable of ``Product.image``.
    """
    return os.path.join('images', "products", instance.title, _unique_upload_name(filename))


def product_relatedimage_directory_path(instance, filename):
    """Build the upload path ``images/product_relatedimage/<product title>/<random name>``.

    ``instance`` must expose the owning product as ``instance.product``.
    """
    return os.path.join('images', "product_relatedimage", instance.product.title,
                        _unique_upload_name(filename))


class ProductsCategory(models.Model):
    """Product category; categories can nest through ``parent_category``."""
    name = models.CharField('產(chǎn)品分類名', max_length=80, unique=True)
    description = models.TextField('產(chǎn)品分類描述', blank=True, null=True)
    slug = models.SlugField('slug', max_length=80, blank=True, null=True)
    parent_category = models.ForeignKey('self', verbose_name="父級(jí)分類", blank=True, null=True,
                                        on_delete=models.CASCADE)

    def save(self, *args, **kwargs):
        """Derive the slug from ``name`` on first save or whenever the slug is empty."""
        if not self.id or not self.slug:
            self.slug = slugify(self.name)
        super().save(*args, **kwargs)

    def __str__(self):
        return self.name

    class Meta:
        ordering = ['name']
        verbose_name = "產(chǎn)品分類"
        verbose_name_plural = verbose_name


class ProductsTag(models.Model):
    """Product tag."""
    name = models.CharField('產(chǎn)品標(biāo)簽名', max_length=30, unique=True)
    slug = models.SlugField('slug', max_length=40)

    def __str__(self):
        return self.name

    def save(self, *args, **kwargs):
        """Derive the slug from ``name`` on first save or whenever the slug is empty."""
        if not self.id or not self.slug:
            self.slug = slugify(self.name)
        super().save(*args, **kwargs)

    class Meta:
        ordering = ['name']
        verbose_name = "產(chǎn)品標(biāo)簽"
        verbose_name_plural = verbose_name


class Product(models.Model):
    """A product with its image, technical-parameter HTML, category and tags."""
    title = models.CharField('標(biāo)題', max_length=255, unique=True)
    slug = models.SlugField('slug', max_length=255, blank=True, null=True)
    jscs = models.TextField('技術(shù)參數(shù)', blank=True, null=True)
    image = models.ImageField(upload_to=products_directory_path, verbose_name="產(chǎn)品圖片")
    views = models.PositiveIntegerField('瀏覽量', default=0)
    category = models.ForeignKey('ProductsCategory', verbose_name='分類', on_delete=models.CASCADE,
                                 blank=True, null=True)
    tags = models.ManyToManyField('ProductsTag', verbose_name='標(biāo)簽集合', blank=True)

    def save(self, *args, **kwargs):
        """Derive the slug from ``title`` on first save or whenever the slug is empty."""
        if not self.id or not self.slug:
            self.slug = slugify(self.title)
        super().save(*args, **kwargs)

    def update_views(self):
        """Increment the view counter by one.

        The increment is expressed with ``F('views') + 1`` so the database performs it
        atomically; a Python-side ``self.views += 1`` loses updates when two requests
        read the same value concurrently.
        """
        self.views = models.F('views') + 1
        self.save(update_fields=['views'])
        # Replace the F() expression left on the instance with the stored integer.
        self.refresh_from_db(fields=['views'])

    def get_pre(self):
        """Return the product with the closest smaller id, or None."""
        return Product.objects.filter(id__lt=self.id).order_by('-id').first()

    def get_next(self):
        """Return the product with the closest larger id, or None."""
        return Product.objects.filter(id__gt=self.id).order_by('id').first()

    def __str__(self):
        return self.title

    class Meta:
        verbose_name = "產(chǎn)品"
        verbose_name_plural = verbose_name


class ProductAdvantage(models.Model):
    """One advantage / feature text line attached to a product."""
    content = models.TextField('產(chǎn)品優(yōu)勢(shì)', blank=True, null=True)
    product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

    def __str__(self):
        # content is nullable; __str__ must always return a str.
        return self.content or ''

    class Meta:
        verbose_name = "產(chǎn)品優(yōu)勢(shì)"
        verbose_name_plural = verbose_name


class ProductBody(models.Model):
    """One paragraph of descriptive body text attached to a product."""
    body = models.CharField('產(chǎn)品內(nèi)容', max_length=256, blank=True, null=True)
    product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

    def __str__(self):
        # product is nullable; fall back to the body text instead of raising AttributeError.
        if self.product_id is None:
            return self.body or ''
        return self.product.title

    class Meta:
        verbose_name = "產(chǎn)品內(nèi)容"
        verbose_name_plural = verbose_name
2.1 編寫獲取網頁源代碼函數
def get_one_page(url, timeout=10):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The page text when the server answers with HTTP 200, otherwise None
        (non-200 status or any request-level failure).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    try:
        res = requests.get(url=url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Connection errors, timeouts and malformed URLs: best-effort, report "no page".
        return None
    if res.status_code != 200:
        return None
    # Force UTF-8 instead of the encoding requests guesses from the headers.
    res.encoding = 'utf-8'
    return res.text
2.2 根據base頁面獲取所有產品分類頁面鏈接
if __name__ == '__main__':
    # urljoin handles absolute, root-relative and relative hrefs alike,
    # which plain string concatenation does not.
    from urllib.parse import urljoin

    site_root = 'http://www.kexinjianji.com/'

    content = get_one_page(url)
    if not content:
        # get_one_page returns None on failure; stop here with a clear message
        # instead of letting the parser fail on a missing document.
        raise SystemExit('failed to fetch ' + url)
    tree = etree.HTML(content)
    # Links to the product-category pages in the navigation block.
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h4/a/@href')
    # Turn each href into a full URL and print it.
    for category_href in catgory_urls:
        print(urljoin(site_root, category_href))
2.3 根據產品分類頁面鏈接獲取對應所有產品鏈接
if __name__ == '__main__':
    # urljoin handles absolute, root-relative and relative hrefs alike.
    from urllib.parse import urljoin

    site_root = 'http://www.kexinjianji.com/'

    content = get_one_page(url)
    if not content:
        # get_one_page returns None on failure; stop with a clear message.
        raise SystemExit('failed to fetch ' + url)
    tree = etree.HTML(content)
    # Category name taken from the page heading; may be missing on unexpected layouts.
    catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h4/span/text()')
    print("產(chǎn)品分類:" + (catgory[0] if catgory else ''))
    # Links to the products listed under this category.
    urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    for product_href in urls:
        print(urljoin(site_root, product_href))
    print("=====================================================")
兩者結合起來就可以打印出所有產品鏈接
if __name__ == '__main__':
    # urljoin handles absolute, root-relative and relative hrefs alike.
    from urllib.parse import urljoin

    site_root = 'http://www.kexinjianji.com/'

    content = get_one_page(url)
    if not content:
        # Without the start page there is nothing to crawl.
        raise SystemExit('failed to fetch ' + url)
    tree = etree.HTML(content)
    # Links to the product-category pages in the navigation block.
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h4/a/@href')
    for category_href in catgory_urls:
        category_url = urljoin(site_root, category_href)
        category_content = get_one_page(category_url)
        if not category_content:
            # One unreachable category must not abort the remaining ones.
            print('出錯(cuò)url:' + category_url)
            continue
        category_tree = etree.HTML(category_content)
        # Category name taken from the page heading.
        catgory = category_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h4/span/text()')
        print("產(chǎn)品分類:" + (catgory[0] if catgory else ''))
        # Links to the products listed under this category.
        product_hrefs = category_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        for product_href in product_hrefs:
            print(urljoin(site_root, product_href))
        print("=====================================================")
2.4 使用xpath解析函數返回產品鏈接的內容
if __name__ == '__main__':
    # urljoin avoids the double slash that 'http://host/' + '/path' produces.
    from urllib.parse import urljoin

    site_root = 'http://www.kexinjianji.com/'

    content = get_one_page(url)
    if not content:
        # get_one_page returns None on failure; stop with a clear message.
        raise SystemExit('failed to fetch ' + url)
    tree = etree.HTML(content)
    # Product name.
    title = tree.xpath('//*[@id="wrap"]//h2/text()')
    # Product image; empty when the page has no image in the expected block.
    images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
    images_url = urljoin(site_root, images[0]) if images else ''
    # Feature lines: text directly inside the divs plus text inside nested spans.
    xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
    # Technical parameters: the first table serialised back to HTML.
    # Some pages carry no table at all, so the lookup is guarded.
    tables = tree.xpath('//table')
    jscs_str = etree.tostring(tables[0], encoding='utf-8').decode('utf-8') if tables else ''
    # Product description paragraphs.
    cpnr = tree.xpath('//div[@class="describe"]/p')

    print('產(chǎn)品名稱:' + (title[0] if title else ''))
    print('產(chǎn)品圖片:' + images_url)
    for td in xntd:
        print('性能特點(diǎn):' + td)
    print('技術(shù)參數(shù):' + jscs_str)
    for cp in cpnr:
        # string(.) yields all text under the paragraph, nested tags included.
        print('產(chǎn)品內(nèi)容:' + cp.xpath('string(.)'))
    print('============================================')
將三者結合在一起就可以獲取所有產品信息
if __name__ == '__main__':
    # urljoin handles absolute, root-relative and relative hrefs alike.
    from urllib.parse import urljoin

    site_root = 'http://www.kexinjianji.com/'

    content = get_one_page(url)
    if not content:
        # Without the start page there is nothing to crawl.
        raise SystemExit('failed to fetch ' + url)
    tree = etree.HTML(content)
    # Links to the product-category pages in the navigation block.
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h4/a/@href')
    for category_href in catgory_urls:
        category_url = urljoin(site_root, category_href)
        category_content = get_one_page(category_url)
        if not category_content:
            # One unreachable category must not abort the remaining ones.
            print('出錯(cuò)url:' + category_url)
            continue
        category_tree = etree.HTML(category_content)
        # Category name taken from the page heading.
        catgory = category_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h4/span/text()')
        # Links to the products listed under this category.
        product_hrefs = category_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        for product_href in product_hrefs:
            product_url = urljoin(site_root, product_href)
            product_content = get_one_page(product_url)
            if not product_content:
                print('出錯(cuò)url:' + product_url)
                continue
            # Per-product boundary: any page that does not match the expected
            # layout is reported with its URL and the crawl moves on.
            try:
                product_tree = etree.HTML(product_content)
                # Product name.
                title = product_tree.xpath('//*[@id="wrap"]//h2/text()')
                # Product image.
                images = product_tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
                images_url = urljoin(site_root, images[0])
                # Feature lines: text inside the divs and inside nested spans.
                xntd = product_tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
                # The technical-parameter table is not printed here, but a page
                # without one still counts as a failure (IndexError) so that it
                # shows up in the error list.
                product_tree.xpath('//table')[0]
                # Product description paragraphs.
                cpnr = product_tree.xpath('//div[@class="describe"]/p')

                print("產(chǎn)品分類:" + catgory[0])
                print('產(chǎn)品鏈接:' + product_url)
                print('產(chǎn)品名稱:' + title[0])
                print('產(chǎn)品圖片:' + images_url)
                for td in xntd:
                    print('性能特點(diǎn):' + td.strip())
                for cp in cpnr:
                    # string(.) yields all text under the paragraph, nested tags included.
                    print('產(chǎn)品內(nèi)容:' + cp.xpath('string(.)'))
                print('============================================')
            except Exception as e:
                print(e)
                print('出錯(cuò)url:' + product_url)
# Crawl every category and product page of the site and store the results
# through the Django models of the ``products`` app.
import os
import uuid
from urllib.parse import urljoin

import django
import requests
from django.core.files.base import ContentFile
from django.db import transaction
from lxml.html import etree

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiaobanzhan.settings")
django.setup()

# Models can only be imported once django.setup() has run.
from products.models import ProductBody, ProductsCategory, Product, ProductAdvantage

SITE_ROOT = 'http://www.kexinjianji.com/'
# NOTE(review): the start-URL literal in the source was truncated and left
# unterminated ('/tupian/20230522/&...). The site root is assumed here because
# every link below is resolved against it -- confirm against the real start page.
url = SITE_ROOT

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
REQUEST_TIMEOUT = 10  # seconds


def get_one_page(url):
    """Fetch *url* and return its body decoded as UTF-8, or None on failure.

    None is returned for a non-200 status and for any request-level error
    (connection problem, timeout, malformed URL); the error is printed.
    """
    try:
        res = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('request failed for {}: {}'.format(url, exc))
        return None
    if res.status_code != 200:
        return None
    res.encoding = 'utf-8'
    return res.text


def _fetch_tree(page_url):
    """Download *page_url* and return the parsed HTML tree, or None if the fetch failed."""
    content = get_one_page(page_url)
    if not content:
        return None
    return etree.HTML(content)


def _download_image(image_url):
    """Download *image_url* and wrap the bytes in a ContentFile with a random file name."""
    response = requests.get(url=image_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Do not store an error page as if it were the product image.
    response.raise_for_status()
    ext = image_url.split('.')[-1]  # image type taken from the URL
    filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
    return ContentFile(response.content, name=filename)


def _save_product(tree, category_name):
    """Extract one product from *tree* and store it with its advantage and body rows.

    Raises IndexError when the page lacks the title, image or parameter table,
    so that the caller reports the URL for manual handling.
    """
    # Product name.
    title = tree.xpath('//*[@id="wrap"]//h2/text()')
    # Product image.
    images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
    images_url = urljoin(SITE_ROOT, images[0])
    # Feature lines: text inside the divs and inside nested spans.
    xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
    # Technical parameters: the first table serialised back to HTML.
    jscs = tree.xpath('//table')[0]
    jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
    # Product description paragraphs.
    cpnr = tree.xpath('//div[@class="describe"]/p')

    upload_image_file = _download_image(images_url)

    # Database rows are written all-or-nothing, so a failure half-way does not
    # leave a product behind that blocks a later retry through the unique title.
    with transaction.atomic():
        # Reuse the category when it exists, create it otherwise.
        products_catgory, _created = ProductsCategory.objects.get_or_create(name=category_name)
        print(products_catgory)

        product = Product(title=title[0], jscs=jscs_str, image=upload_image_file,
                          category=products_catgory)
        product.save()

        for td in xntd:
            text = td.strip()
            # Whitespace-only text nodes between tags are not advantages.
            if text:
                ProductAdvantage.objects.create(content=text, product=product)

        for cp in cpnr:
            # string(.) yields all text under the paragraph, nested tags included.
            ProductBody.objects.create(body=cp.xpath('string(.)'), product=product)


if __name__ == '__main__':
    tree = _fetch_tree(url)
    if tree is None:
        # Without the start page there is nothing to crawl.
        raise SystemExit('failed to fetch ' + url)
    # Links to the product-category pages in the navigation block.
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h4/a/@href')
    for category_href in catgory_urls:
        category_url = urljoin(SITE_ROOT, category_href)
        category_tree = _fetch_tree(category_url)
        if category_tree is None:
            # One unreachable category must not abort the remaining ones.
            print('出錯(cuò)url:' + category_url)
            continue
        # Category name taken from the page heading.
        p_catgory = category_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h4/span/text()')
        # Links to the products listed under this category.
        product_hrefs = category_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        for product_href in product_hrefs:
            product_url = urljoin(SITE_ROOT, product_href)
            # Per-product boundary: parsing, download and database errors are
            # reported with the URL and the crawl continues with the next product.
            try:
                product_tree = _fetch_tree(product_url)
                if product_tree is None:
                    raise ValueError('page could not be fetched')
                _save_product(product_tree, p_catgory[0])
            except Exception as e:
                print(e)
                print('出錯(cuò)url:' + product_url)
最后自己手動處理出錯url(頁面沒有獲取到技術參數,技術參數是一張圖片)
1. xpath 獲取標簽內容時,p標簽中嵌套span標簽,源碼如下
板 寬:1500mm
板 厚:4.5 mm
出料口:6口
重 量:6000 kg
使用xpath獲取p標簽內容
我想得到的效果如下
板 寬:1500mm
板 厚:4.5 mm
出料口:6口
重 量:6000 kg
使用以下xpath 只能分開獲取,不是想要的效果
//div[@class="describe"]/p/span/text()|//div[@class="describe"]/p/text()
百度之后找到的解決辦法,使用 xpath('string(.)')
1. 先獲取所有p標簽
# Select the <p> elements directly under the div with class "describe" (elements, not text nodes).
cpnr = tree.xpath('//div[@class="describe"]/p')
2. 使用 string(.) 獲取所有標簽所有文本
# string(.) evaluates to the text of the current element including the text of nested tags such as <span>.
cp = cp.xpath('string(.)')
以上就是怎么在python中使用xpath爬取網上數據,小編相信有部分知識點可能是我們日常工作會見到或用到的。希望你能通過這篇文章學到更多知識。更多詳情敬請關注創新互聯行業資訊頻道。