按學科分類【中圖分類】
我們提供的服務有:成都做網站、網站建設、微信公眾號開發、網站優化、網站認證、化州ssl等。為超過千家企事業單位解決了網站和推廣的問題。提供周到的售前諮詢和貼心的售後服務,是有科學管理、有技術的化州網站製作公司
共計三十餘萬條科技報告數據
爬取的網址:https://www.nstrs.cn/kjbg/navigation
!!!
如果要完整地跑起來代碼,需要先看一下我的這篇博客,完成IP代理池的相關配置:
https://www.cnblogs.com/rainbow-1/p/.html
!!!
分析網站數據來源可以發現,是使用的post方式的請求,且參數列表如下:
那麼我們需要做的就是模擬這個請求,同時需要帶上我們自定義的參數,這裡面需要的其實一個就是頁碼pageNo,另一個是分類,如下圖:
parms = {
"pageNo": i,
"competentOrg": "",
"jihuaId": "",
"fieldCode": "",
"classification": name, # 修改
"kjbgRegion": "",
"kjbgType": "",
"grade": ""
}
簡單說一下我都做了什麼,首先是配置IP代理池,存在redis數據庫,每次【設置了隨機延遲時間】隨機取出一個進行訪問。
其次使用了隨機UserAgent請求頭。
爬蟲是直接使用post請求,攜帶參數抓取返回的json數據做解析並存入mysql數據庫。
下面是代碼:
也就是你需要直接運行的方法。
我這部分是從"社會科學總論"這個分類開始爬的,前面那些如果需要爬,就直接改pageList頁碼列表、nameList名稱列表、tableList數據庫表列表就可以【切記是一一對應的!】
import json
import random
from time import sleep
import requests
from fake_useragent import UserAgent
from report_data.into_mysql import insert_mysql
from report_data.ip_redis import my_redis
"""
post方法參數
params:字典或字節序列,作為參數增加到鏈接中
data:字典,字節序列或文件對象,作為請求的內容
json:JSON格式的數據,作為Request的內容
headers:字典,HTTP定制頭(模擬瀏覽器進行訪問)
cookies:字典或CookieJar,Request中的cookie
auth:元組,支持HTTP認證功能
files:字典類型,傳輸文件
timeout:設定超時時間,秒為單位
proxies:字典類型,設定訪問代理服務器,可以增加登陸認證
allow_redirects:True/False,默認為True,重定向開關
stream:True/False,默認為False(即立即下載響應內容),流式下載開關
verify:True/False,默認為True,認證SSL證書開關
cert:本地SSL證書路徑
"""
def get_report(page, name, tableName, timeout=30):
    """Crawl the report list of one classification and store every record.

    For each page number in ``range(1, page)`` the function waits a short
    random delay, POSTs the list query to the NSTRS endpoint with a random
    User-Agent through a proxy taken from the Redis pool, parses the JSON
    response and hands every entry of ``RESULT.list`` to ``insert_mysql``.

    :param page: exclusive upper bound of the page numbers to request.
    :param name: classification name sent as the ``classification`` form
        field and stored with every record.
    :param tableName: MySQL table the records are inserted into.
    :param timeout: seconds to wait for the HTTP request before requests
        raises a timeout error (keeps a dead proxy from hanging the crawl).
    :return: None
    """
    url = "https://www.nstrs.cn/rest/kjbg/wfKjbg/list"
    # Both helpers are loop-invariant, so build them once per call.
    ua = UserAgent()
    proxy_pool = my_redis()

    # NOTE(review): range(1, page) never requests page number `page` itself;
    # confirm whether callers pass "last page + 1" or the real page count.
    for i in range(1, page):
        print("---------------------------------")
        # Draw the User-Agent once so the logged value is the one actually sent.
        user_agent = ua.random
        print("【隨機 UserAgent:】" + user_agent)
        temp_proxy = proxy_pool.get_ip()
        print("【隨機 IP:】" + temp_proxy)

        parms = {
            "pageNo": i,
            "competentOrg": "",
            "jihuaId": "",
            "fieldCode": "",
            "classification": name,
            "kjbgRegion": "",
            "kjbgType": "",
            "grade": ""
        }
        headers = {'User-Agent': user_agent}
        proxies = {
            'https': 'http://' + temp_proxy,
            'http': 'http://' + temp_proxy
        }

        # Small random pause between requests.
        sleep(random.uniform(0, 0.7))

        # Keyword arguments are required here: the third positional parameter
        # of requests.post is `json`, so passing the header/proxy dict
        # positionally meant neither the User-Agent nor the proxy was applied.
        response = requests.post(url, data=parms, headers=headers,
                                 proxies=proxies, timeout=timeout)
        response.encoding = 'utf8'
        print(response.text + '\n')

        json_data = json.loads(response.text)
        # One page of records: a list of dicts (10 per page according to the
        # original author's note).
        for item in json_data['RESULT']['list']:
            insert_mysql(item, name, tableName)
    return
if __name__ == '__main__':
    # The three lists below are index-aligned: entry k of each list describes
    # the same classification (page bound, query name, destination table).
    pageList = [
        788, 779, 656, 584, 573, 510, 440,
        361, 315, 226, 224, 220,
        155, 112, 112, 87, 53, 50, 39,
        33, 18, 12, 5, 4, 2, 2, 2, 2,
    ]
    # NOTE(review): several names contain "(…)" fragments that look like
    # text-encoding damage; verify them against the classification names the
    # site actually expects before running.
    nameList = [
        "社會科學總論", "環(huán)境科學、安全科學", "建筑科學", "輕工業(yè)、手工業(yè)",
        "數(shù)理科學與化學", "能源與動力工程", "電工技術(shù)",
        "礦業(yè)工程", "經(jīng)濟", "文化、科學、教育、體育", "水利工程", "交通運輸",
        "自然科學總論", "石油、天然氣工業(yè)", "冶金工業(yè)", "武器工業(yè)",
        "航空、航天", "哲學、宗教", "原子能技術(shù)",
        "歷史、地理", "政治、法律", "藝術(shù)", "語言、文字", "軍事",
        "綜合性圖書", "文學", "語言、文學",
        "mks主義、ln主義、mzd思想、dxp理論",
    ]
    # "tech_i" appears twice on purpose: two classification names share it.
    tableList = [
        "tech_c", "tech_x", "tech_tu", "tech_ts", "tech_o", "tech_tk", "tech_tm",
        "tech_td", "tech_f", "tech_g", "tech_tv", "tech_u",
        "tech_n", "tech_te", "tech_tf", "tech_tj", "tech_v", "tech_b", "tech_tl",
        "tech_k", "tech_d", "tech_j", "tech_h", "tech_e", "tech_z",
        "tech_i", "tech_i", "tech_a",
    ]
    # Crawl the classifications one after another, in list order.
    for page_bound, class_name, table in zip(pageList, nameList, tableList):
        get_report(page_bound, class_name, table)
返回一個中圖分類名稱對應的中圖分類號
# Classification name -> Chinese Library Classification code.
# Built once at import time instead of on every get_code() call.
# NOTE(review): some keys contain "(…)" fragments that look like
# text-encoding damage; they are kept as-is so lookups keep matching the
# names passed in by callers. Confirm the intended spellings.
_CLC_CODE_BY_NAME = {
    "醫(yī)藥、衛(wèi)生": "R",
    "一般工業(yè)技術(shù)": "TB",
    "生物科學": "Q",
    "數(shù)理科學和化學": "O",
    "農(nóng)業(yè)科學": "S",
    "工業(yè)技術(shù)": "T",
    "自動化技術(shù)、計算機技術(shù)": "TP",
    "天文學、地球科學": "P",
    "無線電電子學、電信技術(shù)": "TN",
    "金屬學與金屬工藝": "TG",
    "機械、儀表工業(yè)": "TH",
    "化學工業(yè)": "TQ",
    "社會科學總論": "C",
    "環(huán)境科學、安全科學": "X",
    "建筑科學": "TU",
    "輕工業(yè)、手工業(yè)": "TS",
    "數(shù)理科學與化學": "O",
    "能源與動力工程": "TK",
    "電工技術(shù)": "TM",
    "礦業(yè)工程": "TD",
    "經(jīng)濟": "F",
    "文化、科學、教育、體育": "G",
    "水利工程": "TV",
    "交通運輸": "U",
    "自然科學總論": "N",
    "石油、天然氣工業(yè)": "TE",
    "冶金工業(yè)": "TF",
    "武器工業(yè)": "TJ",
    "航空、航天": "V",
    "哲學、宗教": "B",
    "原子能技術(shù)": "TL",
    "歷史、地理": "K",
    "政治、法律": "D",
    "藝術(shù)": "J",
    "語言、文字": "H",
    "軍事": "E",
    "綜合性圖書": "Z",
    "文學": "I",
    "語言、文學": "I",
    "mks主義、ln主義、mzd思想、dxp理論": "A",
}


def get_code(key):
    """Return the classification code for a classification name.

    :param key: classification name, exactly as spelled in the lookup table.
    :return: the code string (e.g. ``"TB"``), or ``None`` when the name is
        not in the table.
    """
    return _CLC_CODE_BY_NAME.get(key)
if __name__ == '__main__':
    # Manual smoke check: look up one classification name and print its code.
    print(get_code("工業(yè)技術(shù)"))
返回隨機headers
from fake_useragent import UserAgent # 下載:pip install fake-useragent
import requests
# Build the User-Agent generator at import time. Per the original author's
# note, instantiation may need network access and can be slow.
ua = UserAgent()
# Show one randomly generated User-Agent string.
print(ua.random)
# Request headers carrying a (separately drawn) random User-Agent.
headers = {"User-Agent": ua.random}

if __name__ == "__main__":
    # Demo request: fetch a page with the random headers and print the status.
    target = "https://www.baidu.com/"
    proxy_map = {"http": "117.136.27.43"}
    response = requests.get(target, headers=headers, proxies=proxy_map)
    print(response.status_code)
從redis數據庫取出一個ip並返回(前3000個隨機一個,降序排列)
import random
import redis
class my_redis:
    """Reads proxy addresses from the local Redis proxy pool."""

    def get_ip(self):
        """Return one random member from the top of the proxy pool.

        Connects to Redis at 127.0.0.1:6379 (db 0), reads the first 3000
        members of the sorted set ``proxies:universal`` in descending score
        order and returns one of them chosen at random.

        :return: the chosen member as a string (responses are decoded).
        :raises IndexError: if the sorted set holds no members.
        """
        r = redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
        # Redis ranks are 0-based and the end index is inclusive, so 0..2999
        # covers the first 3000 members; starting at 1 skipped the top entry.
        candidates = r.zrange("proxies:universal", 0, 2999, desc=True)
        if not candidates:
            raise IndexError("proxy pool 'proxies:universal' is empty")
        return random.choice(candidates)
if __name__ == '__main__':
    # Manual check: print one proxy address drawn from the pool.
    print(my_redis().get_ip())
存入mysql數據庫的方法
# 連接數據庫,獲取游標
import pymysql
from report_data.category import get_code
def get_conn():
    """Open a connection to the local ``tech`` MySQL database.

    Connection failures are not handled here: any exception raised by
    ``pymysql.connect`` or ``conn.cursor`` propagates to the caller.

    :return: ``(conn, cursor)`` — the open connection and a cursor on it.
    """
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="reliable",
                           db="tech",
                           charset="utf8mb4")
    # Default cursor class (rows come back as tuples per the original note).
    cursor = conn.cursor()
    # Both calls above either return an object or raise, so the former
    # `(conn != None) & (cursor != None)` check could never take its failure
    # branch; reaching this line means the connection succeeded.
    print("數(shù)據(jù)庫連接成功 ...")
    return conn, cursor
def close_conn(conn, cursor):
    """Close the cursor and then the connection.

    Falsy arguments (e.g. ``None``) are skipped. The connection is closed
    even when closing the cursor raises; that exception still propagates.

    :param conn: connection object exposing ``close()``, or a falsy value.
    :param cursor: cursor object exposing ``close()``, or a falsy value.
    :return: always ``1``.
    """
    try:
        if cursor:
            cursor.close()
    finally:
        # Runs even if cursor.close() failed, so the connection never leaks.
        if conn:
            conn.close()
    return 1
def insert_mysql(data, name, tableName):
    """Insert one report record into the given table.

    :param data: dict for one report; every key listed in ``leading`` and
        ``trailing`` below is read from it (a missing key raises KeyError).
    :param name: classification name stored in the ``classification`` column
        and used to look up ``classificationCode`` via ``get_code``.
    :param tableName: destination table; it is interpolated into the SQL
        text, so it must come from a trusted source.
    :return: always ``1``.
    """
    print(data['title'])

    # Record keys copied verbatim into same-named columns, in column order.
    leading = (
        'id', 'title', 'alternativeTitle', 'creator', 'abstractEn',
        'keywordsEn', 'abstractCn', 'keywordsCn', 'creatOrorganization',
        'prepareOrganization', 'publicDate', 'createTime', 'projectName',
        'competentOrg', 'projectSubjectName', 'projectSubjectId',
    )
    trailing = (
        'responsiblePerson', 'supportChannel', 'undertakeOrg', 'kjbgSource',
        'proposalDate', 'submittedDate', 'kjbgRegion', 'collectionDate',
        'collectionNumber', 'fieldCode', 'fieldId', 'kjbgQWAddress',
        'isNewRecord',
    )

    # Assemble the row in exactly the column order used by the statement.
    row = [data[key] for key in leading]
    row.append(name)
    row.append(get_code(name))
    row.extend([data[key] for key in trailing])
    # The detail-page link is derived from the record id.
    row.append("https://www.nstrs.cn/kjbg/detail?id=" + data['id'])

    columns = leading + ('classification', 'classificationCode') + trailing + ('sourceUrl',)

    conn, cursor = get_conn()
    placeholders = ",".join(["%s"] * len(columns))
    sql = "insert into `" + tableName + "` (" + ",".join(columns) + ") values(" + placeholders + ")"
    try:
        try:
            cursor.execute(sql, row)
        except pymysql.err.IntegrityError:
            # Raised by the insert itself, e.g. a duplicate primary key.
            print("主鍵沖突!")
        conn.commit()
    except pymysql.err.IntegrityError:
        # Only reachable when the commit raises an integrity error.
        print("error!")
    finally:
        close_conn(conn, cursor)
    return 1
if __name__ == "__main__":
    # Placeholder entry point: running this module directly prints an empty line.
    print()
最終爬取三十多萬條科技報告,按中圖分類建立了mysql數據表,分表存儲不同分類的數據。
【其中的數理科學和化學,數理科學與化學這兩個分類做了合併,合併為數理科學和化學類,屬O】
【語言、文學和文學做了合併,同屬 I 文學類】
附幾張結果圖:
最後說一下數據表結構:
/*
Navicat MySQL Data Transfer
Source Server : reliable
Source Server Version :
Source Host : localhost:3306
Source Database : tech
Target Server Type : MYSQL
Target Server Version :
File Encoding :
Date: 2022-09-24 13:54:05
*/
-- Disable foreign-key checks while the table is dropped and recreated;
-- they are switched back on at the end of this script.
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for tech_o
-- ----------------------------
-- WARNING: this drops any existing tech_o table together with its data.
DROP TABLE IF EXISTS `tech_o`;
-- One row per report; every column is stored as text/varchar, keyed by `id`.
CREATE TABLE `tech_o` (
`id` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT 'ID',
`title` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '中文標題',
`alternativeTitle` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '英文標題',
`creator` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '作者',
`abstractEn` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '英文摘要',
`keywordsEn` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '英文關(guān)鍵字',
`abstractCn` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '中文摘要',
`keywordsCn` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '中文關(guān)鍵字',
`creatOrorganization` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '創(chuàng)建者組織',
`prepareOrganization` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '預備組織',
`publicDate` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '公布時間',
`createTime` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '編制時間',
`projectName` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '項目名稱',
`competentOrg` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '項目地址',
`projectSubjectName` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '項目主題名稱',
`projectSubjectId` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '項目主題ID',
`classification` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '中圖分類名稱',
`classificationCode` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '中圖分類號',
`responsiblePerson` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '負責人',
`supportChannel` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '主辦方',
`undertakeOrg` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '承辦方',
`kjbgSource` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '科技報告來源單位',
`proposalDate` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '提議時間',
`submittedDate` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '提交時間',
`kjbgRegion` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '科技報告所屬行政區(qū)劃',
`collectionDate` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '收集時間',
`collectionNumber` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '收集編號',
`fieldCode` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '領(lǐng)域代碼',
`fieldId` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '領(lǐng)域ID',
`kjbgQWAddress` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '報告鏈接',
`isNewRecord` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '是否新記錄',
`sourceUrl` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '國家科技報告服務(wù)系統(tǒng)收錄鏈接',
PRIMARY KEY (`id`)
-- Table default charset matches the utf8mb4 columns, so any column added
-- later without an explicit charset also stores 4-byte characters.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- Restore foreign-key checking for the rest of the session.
SET FOREIGN_KEY_CHECKS=1;
後續數據處理參考:科技報告數據語料處理(關鍵詞、中圖分類名稱)【https://www.cnblogs.com/rainbow-1/p/.html】
如果需要獲取這部分數據,可關注我的微信公眾號【靠譜楊的挨踢生活】,回復 “科技報告” 獲取下載鏈接。