爬取某網站寫的python代碼-創新互聯

代碼如下：

創新互聯公司堅持“要么做到，要么別承諾”的工作理念，服務領域包括：成都網站設計、網站建設、企業官網、英文網站、手機端網站、網站推廣等服務，滿足客戶于互聯網時代的鹽津網站設計、移動媒體設計的需求，幫助企業找到有效的互聯網解決方案。努力成為您成熟可靠的網絡建設合作伙伴！

import requests
from pyquery import PyQuery
import re
import os
import csv
import datetime
"""
    說明：該代碼是專門為爬取http://www.kgtmall.com.cn/商品而設計的。
    使用方法：
        1、在本地提前安裝好python3的環境；
        2、直接運行本代碼；
        3、運行本代碼完后，會在當前目錄生成一個result.csv文件，該文件里面就存了爬取該站點的商品信息
    注意事項：在本代碼運行期間，不能打開result.csv文件，因為這樣程序就寫不進去數據了；只能等本代碼
            全部運行結束后，才能打開result.csv文件進行查看。
    
"""
def get_html_text(url, timeout=10):
    """
    Download a page and return its body as text.

    :param url: URL of the page to fetch.
    :param timeout: seconds to wait for the server before giving up; passed
        straight to ``requests.get``. Defaults to 10.
    :return: the response body as decoded by ``requests`` (``Response.text``).
    :raises requests.exceptions.RequestException: if the request fails or the
        timeout expires.
    """
    # requests applies no timeout unless one is given, so a single stalled
    # connection would otherwise block the whole crawl forever.
    response = requests.get(url, timeout=timeout)
    return response.text
def get_one_level_class(home_url):
    """
    Yield the first-level categories linked from the home page.

    The home page is downloaded with ``get_html_text`` and every element
    selected by ``.menu_title a`` produces one ``(url, title)`` pair, where
    ``url`` is the link's ``href`` (e.g.
    ``http://www.kgtmall.com.cn/mall/list.php?catid=4``) and ``title`` is the
    link text.

    :param home_url: URL of the site's home page.
    :return: generator of ``(one_level_url, one_level_title)`` tuples.
    """
    home_doc = PyQuery(get_html_text(home_url))
    for anchor in home_doc('.menu_title a'):
        link = PyQuery(anchor)('a')
        yield link.attr('href'), link.text()
def get_two_level_class(home_url):
    """
    Yield the second-level categories listed under each first-level category.

    For every pair produced by ``get_one_level_class`` the first-level page is
    parsed and each ``.selector_category li`` entry contributes one result
    built from the ``a`` element inside it (e.g. a link such as
    ``http://www.kgtmall.com.cn/mall/search.php?catid=539``).

    :param home_url: URL of the site's home page.
    :return: generator of
        ``(one_level_title, two_level_title, two_level_url)`` tuples.
    """
    for parent_url, parent_title in get_one_level_class(home_url):
        # The URL string is handed to PyQuery directly; PyQuery is relied on
        # to load the page behind it.
        category_page = PyQuery(parent_url)
        for entry in category_page('.selector_category li'):
            link = PyQuery(entry)('a')
            child_url = link.attr('href')
            child_title = link.text()
            yield parent_title, child_title, child_url
def get_pages(url):
    """
    Return the number of result pages of a category listing.

    The count is read from the text of the ``.pagination cite`` element,
    which is expected to look like ``共<total>條/<pages>頁``.

    :param url: URL of the listing page; handed to PyQuery, which is relied
        on to load it.
    :return: the page count as an ``int``, or 1 when no count can be found in
        the pager text.
    """
    doc = PyQuery(url)
    pager_text = doc('.pagination cite').text()
    print('原pages：', pager_text)
    # Match the digits explicitly instead of a greedy ``(.*)`` followed by
    # int(), so surrounding text cannot break the conversion. ``or ''`` covers
    # a missing pager element if .text() yields None.
    # NOTE(review): both Traditional and Simplified unit characters are
    # accepted because the script served by the live site is not visible
    # here - confirm against a real page.
    match = re.search(r'共.*?[條条]/\s*(\d+)\s*[頁页]', pager_text or '')
    if match is None:
        print('no page count found in pager text, assuming a single page')
        pages = 1
    else:
        pages = int(match.group(1))
    print('頁碼：', pages)
    return pages
def get_three_level_class(home_url):
    """
    Yield one listing-page URL per result page of every third-level category.

    For each second-level category from ``get_two_level_class`` the page is
    parsed, and every ``.selector_category li`` entry is treated as a
    third-level category. Its ``catid`` is extracted from the link (e.g.
    ``http://www.kgtmall.com.cn/mall/search.php?catid=548``), the number of
    result pages is obtained from ``get_pages``, and one search URL is built
    for each page number from 1 to that count.

    Entries whose link is missing or does not have the expected form are
    skipped instead of aborting the crawl.

    :param home_url: URL of the site's home page.
    :return: generator of ``(one_level_title, two_level_title,
        three_level_title, three_level_url_by_xiaoliang)`` tuples.
    """
    # Raw string: the original non-raw '\?' is an invalid string escape, and
    # the dots are escaped so they only match literal dots.
    catid_pattern = re.compile(
        r'http://www\.kgtmall\.com\.cn/mall/search\.php\?catid=(.*)')
    for one_level_title, two_level_title, two_level_url in get_two_level_class(home_url):
        listing = PyQuery(two_level_url)
        for entry in listing('.selector_category li'):
            link = PyQuery(entry)('a')
            three_level_title = link.text()
            three_level_url = link.attr('href')
            # ``or ''`` handles an <li> without a usable href.
            match = catid_pattern.search(three_level_url or '')
            if match is None:
                print('skipping category link without catid:', three_level_url)
                continue
            catid = match.group(1)
            pages = get_pages(three_level_url)
            for index in range(1, pages + 1):
                # order=10 presumably sorts by sales volume, judging by the
                # variable name - confirm against the site.
                three_level_url_by_xiaoliang = 'http://www.kgtmall.com.cn/mall/search.php?kw=&list=0&catid={}&order=10&minprice=&maxprice=&page={}'.format(
                    catid, index)
                yield one_level_title, two_level_title, three_level_title, three_level_url_by_xiaoliang
def shop_title_and_url(home_url):
    """
    Yield the title and detail-page URL of every product on every listing page.

    Each listing URL produced by ``get_three_level_class`` is parsed and every
    element selected by ``.list_img a`` yields one product: the title comes
    from the ``alt`` attribute of the image inside the link, the URL from the
    link's ``href`` (e.g.
    ``http://www.kgtmall.com.cn/mall/show.php?itemid=28089``).

    :param home_url: URL of the site's home page.
    :return: generator of ``(one_level_title, two_level_title,
        three_level_title, shop_title, shop_url)`` tuples.
    """
    for level1, level2, level3, listing_url in get_three_level_class(home_url):
        listing = PyQuery(listing_url)
        for anchor in listing('.list_img a'):
            item = PyQuery(anchor)
            product_title = item('a img').attr('alt')
            product_url = item('a').attr('href')
            yield level1, level2, level3, product_title, product_url
def get_shop_info(home_url, count):
    """
    Crawl every product page and append one row per product to result.csv.

    For each product from ``shop_title_and_url`` the detail page is parsed
    for its price (``.price``), barcode (``.bar_code dl dd p``) and, from the
    text of ``#content``, the specification and place of origin. A fallback
    string is stored when either of the latter two is not present. Each row
    is printed and written to ``result.csv`` next to this script.

    :param home_url: URL of the site's home page.
    :param count: starting value of the running product number that is
        printed with each row; it is incremented locally and not returned.
    :return: None
    """
    # abspath() guards against __file__ being a bare relative name, in which
    # case dirname() is '' and the old "dirname + '/result.csv'" pointed at
    # the filesystem root.
    csv_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'result.csv')
    # The previous literals carried stray pinyin groups such as "(guī)" in the
    # middle of the word, which made the patterns unmatchable and added a
    # spurious capture group.
    # NOTE(review): both Traditional and Simplified spellings are accepted
    # because the script served by the live site is not visible here.
    spec_pattern = re.compile('(?:規格|规格)：(.*)')
    origin_pattern = re.compile('(?:產地|产地)：(.*)')
    # newline='' avoids blank lines between rows; utf-8 avoids encoding
    # errors when writing the Chinese text.
    with open(csv_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for one_level_title, two_level_title, three_level_title, shop_title, shop_url in shop_title_and_url(home_url):
            print('--排錯：' + one_level_title, two_level_title, three_level_title, shop_title, shop_url)
            page = PyQuery(shop_url)
            price = page('.price').text()
            # Barcode
            bar_code = page('.bar_code dl dd p').text()
            # ``or ''`` covers a missing #content element if .text() yields None.
            detail_text = page('#content').text() or ''
            spec_match = spec_pattern.search(detail_text)
            guige = spec_match.group(1) if spec_match else '沒有規格'
            origin_match = origin_pattern.search(detail_text)
            chandi = origin_match.group(1) if origin_match else '沒有產地'
            print(count, one_level_title, two_level_title, three_level_title, shop_title,  bar_code, chandi, guige,  price, shop_url)
            writer.writerow([one_level_title, two_level_title, three_level_title, shop_title,  bar_code, chandi, guige,  price, shop_url])
            # Flush per row so rows already crawled reach the file even if a
            # later page aborts the run.
            f.flush()
            count += 1
def main():
    """
    Run the whole crawl: create result.csv with a header row, crawl every
    product of http://www.kgtmall.com.cn/ via ``get_shop_info``, then print
    the elapsed time.

    The CSV file is (re)created next to this script; an existing file is
    overwritten.
    """
    # Record the start time
    start_time = datetime.datetime.now()
    home_url = 'http://www.kgtmall.com.cn/'
    # Directory of this script. abspath() guards against __file__ being a
    # bare relative name, where dirname() is '' and the old
    # "dirname + '/result.csv'" pointed at the filesystem root.
    csv_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'result.csv')
    # Column labels; stray pinyin fragments such as "(guī)" that had leaked
    # into two of them were removed.
    headers = ['一級分類', '二級分類', '三級分類', '商品名稱', '條碼', '產地', '規格', '價格', '商品鏈接']
    # newline='' avoids blank lines between rows; utf-8 avoids encoding
    # errors when writing the Chinese text.
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(headers)
    get_shop_info(home_url, 1)
    # total_seconds() includes whole days, which timedelta.seconds drops.
    elapsed = datetime.datetime.now() - start_time
    print('總共用時{}秒\n'.format(int(elapsed.total_seconds())))
    # NOTE(review): this literal looks damaged by text extraction; its
    # original wording cannot be recovered from here, so it is left untouched.
    print('全部商品已經(jīng)按需求完成?。?！')
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

運行后，會在當前目錄下生成個result.csv文件，內容如下：

爬取某網(wǎng)站寫的python代碼

當前題目：爬取某網站寫的python代碼-創新互聯
轉載來于：http://weahome.cn/article/csjsej.html

真实的国产乱ⅩXXX66竹夫人,五月香六月婷婷激情综合,亚洲日本VA一区二区三区,亚洲精品一区二区三区麻豆

爬取某網(wǎng)站寫的python代碼-創(chuàng)新互聯(lián)

其他資訊

網(wǎng)站制作

企業(yè)服務(wù)

網(wǎng)站建設(shè)

服務(wù)器托管