# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
import os
# Script: for a fixed search keyword, walk the cs.com.cn search result pages
# year by year and save each result's summary text to its own .txt file.

# Re-wrap stdout so Chinese text prints on a GB-encoded console.
# 'gb' is not a codec name Python recognises (TextIOWrapper raises LookupError);
# GB18030 is a superset of GBK/GB2312, so it is used instead.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Search keyword; it also names the output folder.
query = '蘇州銀行'
# Years to crawl and, position by position, how many result pages each has.
years = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
page_counts = [2, 1, 1, 1, 11, 1, 19, 7]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
# Inline style that identifies the summary cell of a search hit.
# NOTE(review): the colour value after '#' looks truncated; with this exact
# string the lookup may match nothing -- confirm against the live page.
summary_style = 'font-size: 12px;line-height: 24px;color: #;margin-top: 4px;'
base_dir = f'D:/桌面/爬蟲-銀行/中國證券網(wǎng)/{query}'

for year, pages in zip(years, page_counts):
    out_dir = f'{base_dir}/{year}'
    # makedirs also creates missing parent folders and tolerates existing ones.
    os.makedirs(out_dir, exist_ok=True)
    m = 0  # number of files already written for this year
    for p in range(1, pages + 1):
        # '&timescope=' / '&timescopecolumn=' restored: they had been mangled
        # into '×cope=' by HTML-entity decoding of '&times'.
        url = f'http://search.cs.com.cn/search?page={p}&channelid=&searchword={query}&keyword={query}&token=12..47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn=&orderby=&timeline=={year}'
        resp = requests.get(url, headers=headers)
        resp.encoding = 'utf-8'
        # The page banner is printed three times, as in the original output.
        for _ in range(3):
            print(f'\n>>>--------------------第{p}頁---------------------<<<\n')
        page = BeautifulSoup(resp.text, "html.parser")  # use the built-in HTML parser
        # Collect the summary text of every result table that has one.
        datalist = []
        for table in page.find_all("table"):
            cell = table.find('td', style=summary_style)
            if cell is not None:
                datalist.append(cell.get_text())
        for ii, text in enumerate(datalist):
            # 'with' closes the file even if the write fails.
            with open(f'{out_dir}/({year}){ii + m + 1}.txt', 'w+', encoding='utf-8') as fp:
                fp.write(text + '\n')  # text only
            print(text)
            print(f'\n> > >{year}年,第{p}頁,第{ii + 1}篇,成功! < < <')
        # Advance by exactly the number of files written so numbering stays
        # contiguous (the previous "+ 1" skipped one number per page).
        m += len(datalist)
    print('----------------------------')
    print(f'------\n{year}年,爬取完畢----')
    print('----------------------------')
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
# Script: ask for a keyword and a page count, follow every https article link
# on each search result page and save the article's <section> paragraphs.
import os

# Re-wrap stdout so Chinese text prints on a GB-encoded console.
# 'gb' is not a codec name Python recognises (TextIOWrapper raises LookupError);
# GB18030 is a superset of GBK/GB2312, so it is used instead.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

query = input("【中證網(wǎng)】請輸入你想搜索的內(nèi)容:")
pages = int(input("要爬取的頁數(shù)(不小于1):"))
if pages < 1:
    sys.exit()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                  "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
# NOTE(review): the folder name is fixed and does not follow `query` -- confirm.
out_dir = 'D:/桌面/爬蟲-銀行/中國證券網(wǎng)/中國銀行'
os.makedirs(out_dir, exist_ok=True)

# Page 1 keeps its original URL form; pages 2..N use the paged form.
# Previously the paging loop built the paged URL but requested the page-1 URL
# again, and it also restarted at page 1, so only page 1 was ever downloaded.
listing_urls = [f'http://search.cs.com.cn/search?channelid=&perpage=&templet=&token=12..47&searchword={query}']
listing_urls += [
    f"http://search.cs.com.cn/search?page={p}&channelid=&searchword={query}"
    for p in range(2, pages + 1)
]

saved = 0  # running count of article files written; also the file number
for listing_url in listing_urls:
    resp = requests.get(listing_url, headers=headers)
    resp.encoding = 'utf-8'
    page = BeautifulSoup(resp.text, "html.parser")  # use the built-in HTML parser
    table = page.find("table")
    anchors = table.find_all("a") if table is not None else []
    # Keep only absolute https links; anchors without an href are ignored.
    weblist = []
    for a in anchors:
        href = a.get('href')
        if href and href.startswith("https"):
            weblist.append(href)
    # ---------------- every article on this result page ----------------
    for url_a in weblist:
        resp_a = requests.get(url_a, headers=headers)
        resp_a.encoding = 'gbk'
        page_a = BeautifulSoup(resp_a.text, "html.parser")
        section = page_a.find('section')
        if section is None:
            # No article body found; skip instead of aborting the whole crawl.
            print(f'>>skipped (no <section>): {url_a}')
            continue
        txt_list = [para.text for para in section.find_all('p')]
        saved += 1
        # 'with' closes every file, even if a write fails.
        with open(f'{out_dir}/{saved}.txt', 'w+', encoding='utf-8') as fp:
            fp.writelines(line + '\n' for line in txt_list)  # text only
        print(f'>>{saved}成功!')
print('---------------\n>>>爬取完畢<<<')
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
# Script: ask for a keyword and a page count, follow every https article link
# on each search result page and save the article's <section> paragraphs as
# "(2021)<n>.txt".
import os

# Re-wrap stdout so Chinese text prints on a GB-encoded console.
# 'gb' is not a codec name Python recognises (TextIOWrapper raises LookupError);
# GB18030 is a superset of GBK/GB2312, so it is used instead.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

query = input("【中證網(wǎng)】請輸入你想搜索的內(nèi)容:")
pages = int(input("要爬取的頁數(shù)(不小于1):"))
if pages < 1:
    sys.exit()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                  "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
# NOTE(review): the folder name is fixed and does not follow `query` -- confirm.
out_dir = 'D:/桌面/爬蟲-銀行/中國證券網(wǎng)/中國銀行/0'
os.makedirs(out_dir, exist_ok=True)

saved = 0  # running count of article files written; also the file number
# One pass over pages 1..N. Previously the paging loop built a per-page URL
# but requested the page-1 URL again, and it also restarted at page 1, so
# only page 1 was ever downloaded (repeatedly).
for p in range(1, pages + 1):
    url = f'http://search.cs.com.cn/search?page={p}&channelid=&searchword={query}'
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    page = BeautifulSoup(resp.text, "html.parser")  # use the built-in HTML parser
    table = page.find("table")
    anchors = table.find_all("a") if table is not None else []
    # Keep only absolute https links; anchors without an href are ignored.
    weblist = []
    for a in anchors:
        href = a.get('href')
        if href and href.startswith("https"):
            weblist.append(href)
    # ---------------- every article on this result page ----------------
    for url_a in weblist:
        resp_a = requests.get(url_a, headers=headers)
        resp_a.encoding = 'gbk'
        page_a = BeautifulSoup(resp_a.text, "html.parser")
        section = page_a.find('section')
        if section is None:
            # No article body found; skip instead of aborting the whole crawl.
            print(f'>>skipped (no <section>): {url_a}')
            continue
        txt_list = [para.text for para in section.find_all('p')]
        saved += 1
        # 'with' closes every file, even if a write fails.
        with open(f'{out_dir}/(2021){saved}.txt', 'w+', encoding='utf-8') as fp:
            fp.writelines(line + '\n' for line in txt_list)  # text only
        print(f'>>{saved}成功!')
print('---------------\n>>>爬取完畢<<<')
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
# Script: ask for a keyword and a page count, query the search with the
# timeline fixed to 2021, follow every https article link and save the
# article's <section> paragraphs as "(2021)<n>.txt".
import os

# Re-wrap stdout so Chinese text prints on a GB-encoded console.
# 'gb' is not a codec name Python recognises (TextIOWrapper raises LookupError);
# GB18030 is a superset of GBK/GB2312, so it is used instead.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

query = input("【中證網(wǎng)】請輸入你想搜索的內(nèi)容:")
pages = int(input("要爬取的頁數(shù)(不小于1):"))
if pages < 1:
    sys.exit()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
# NOTE(review): the folder name is fixed and does not follow `query` -- confirm.
out_dir = 'D:/桌面/爬蟲-銀行/中國證券網(wǎng)/中國銀行/2021'
os.makedirs(out_dir, exist_ok=True)

m = 0  # number of article slots consumed by earlier pages
for p in range(1, pages + 1):
    # '&timescope=' / '&timescopecolumn=' restored: they had been mangled
    # into '×cope=' by HTML-entity decoding of '&times'.
    url = f'http://search.cs.com.cn/search?page={p}&channelid=&searchword={query}&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn=&orderby=&timeline==2021'
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    # The page banner is printed three times, as in the original output.
    for _ in range(3):
        print(f'\n>>>--------------------第{p}頁---------------------<<<\n')
    page = BeautifulSoup(resp.text, "html.parser")  # use the built-in HTML parser
    table = page.find("table")
    anchors = table.find_all('a') if table is not None else []
    # Keep only absolute https links; anchors without an href are ignored.
    weblist = []
    for a in anchors:
        href = a.get('href')
        if href and href.startswith("https"):
            weblist.append(href)
    # ---------------- every article on this result page ----------------
    for ii, url_a in enumerate(weblist):
        resp_a = requests.get(url_a, headers=headers)
        resp_a.encoding = 'gbk'
        page_a = BeautifulSoup(resp_a.text, "html.parser")
        section = page_a.find('section')
        if section is None:
            # No article body found; skip instead of aborting the whole crawl.
            print(f'>>skipped (no <section>): {url_a}')
            continue
        txt_list = [para.text for para in section.find_all('p')]
        print(f'\n-++++++++++++++++++第{ii+1}篇文章++++++++++++++++-\n', txt_list, len(txt_list))
        # 'with' closes the file even if a write fails.
        with open(f'{out_dir}/(2021){ii + m + 1}.txt', 'w+', encoding='utf-8') as fp:
            fp.writelines(line + '\n' for line in txt_list)  # text only
        print(f'\n> > >{ii+1}成功! < < <')
    # Advance by exactly the number of links on this page so numbering stays
    # contiguous (the previous "+ 1" skipped one number per page).
    m += len(weblist)
print('---------------\n>>>爬取完畢<<<')
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
# Script: ask for a keyword and a page count, query the search with the
# timeline fixed to 2020, follow every http(s) article link and save the
# article's <section> paragraphs as "(2020)<n>.txt".
import os

# Re-wrap stdout so Chinese text prints on a GB-encoded console.
# 'gb' is not a codec name Python recognises (TextIOWrapper raises LookupError);
# GB18030 is a superset of GBK/GB2312, so it is used instead.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

query = input("【中證網(wǎng)】請輸入你想搜索的內(nèi)容:")
pages = int(input("要爬取的頁數(shù)(不小于1):"))
if pages < 1:
    sys.exit()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
# NOTE(review): the folder name is fixed and does not follow `query` -- confirm.
out_dir = 'D:/桌面/爬蟲-銀行/中國證券網(wǎng)/中國銀行/2020'
os.makedirs(out_dir, exist_ok=True)

m = 0  # number of article slots consumed by earlier pages
for p in range(1, pages + 1):
    # Request page `p` (the URL used to carry `pages`, so every iteration
    # fetched the last page). '&timescope=' / '&timescopecolumn=' restored:
    # they had been mangled into '×cope=' by HTML-entity decoding of '&times'.
    url = f'http://search.cs.com.cn/search?page={p}&channelid=&searchword={query}&keyword={query}&token=12..47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn=&orderby=&timeline==2020'
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    # The page banner is printed three times, as in the original output.
    for _ in range(3):
        print(f'\n>>>--------------------第{p}頁---------------------<<<\n')
    page = BeautifulSoup(resp.text, "html.parser")  # use the built-in HTML parser
    table = page.find("table")
    alist = table.find_all('a') if table is not None else []
    print('alist:', alist)
    # Keep links that START with "http". The old test compared href[4:]
    # (everything after the 4th character) with "http", which never matched,
    # so no article was ever downloaded.
    weblist = []
    for a in alist:
        href = a.get('href')
        if href and href.startswith("http"):
            weblist.append(href)
    print('weblist==', weblist)
    # ---------------- every article on this result page ----------------
    for ii, url_a in enumerate(weblist):
        resp_a = requests.get(url_a, headers=headers)
        resp_a.encoding = 'gbk'
        page_a = BeautifulSoup(resp_a.text, "html.parser")
        section = page_a.find('section')
        if section is None:
            # No article body found; skip instead of aborting the whole crawl.
            print(f'>>skipped (no <section>): {url_a}')
            continue
        txt_list = [para.text for para in section.find_all('p')]
        print(f'\n-++++++++++++++++++第{ii+1}篇文章++++++++++++++++-\n', txt_list, len(txt_list))
        # 'with' closes the file even if a write fails.
        with open(f'{out_dir}/(2020){ii + m + 1}.txt', 'w+', encoding='utf-8') as fp:
            fp.writelines(line + '\n' for line in txt_list)  # text only
        print(f'\n> > >{ii+1}成功! < < <')
    # Advance by exactly the number of links on this page so numbering stays
    # contiguous (the previous "+ 1" skipped one number per page).
    m += len(weblist)
print('---------------\n>>>爬取完畢<<<')
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
# Script: ask for a keyword, a year and a page count, follow the http(s)
# links found in the first row of the first result table and save all <p>
# text of each linked article as "(<year>)<n>.txt".
import os

# Re-wrap stdout so Chinese text prints on a GB-encoded console.
# 'gb' is not a codec name Python recognises (TextIOWrapper raises LookupError);
# GB18030 is a superset of GBK/GB2312, so it is used instead.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

query = input("【中證網(wǎng)】請輸入你想搜索的內(nèi)容:")
year = int(input('要爬取的年份:'))
pages = int(input("要爬取的頁數(shù)(不小于1):"))
if pages < 1:
    sys.exit()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
# NOTE(review): the folder name is fixed and does not follow `query` -- confirm.
out_dir = f'D:/桌面/爬蟲-銀行/中國證券網(wǎng)/中國銀行/{year}'
os.makedirs(out_dir, exist_ok=True)

m = 0  # number of article slots consumed by earlier pages
for p in range(1, pages + 1):
    # '&timescope=' / '&timescopecolumn=' restored: they had been mangled
    # into '×cope=' by HTML-entity decoding of '&times'.
    url = f'http://search.cs.com.cn/search?page={p}&channelid=&searchword={query}&keyword={query}&token=12..47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn=&orderby=&timeline=={year}'
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    # The page banner is printed three times, as in the original output.
    for _ in range(3):
        print(f'\n>>>--------------------第{p}頁---------------------<<<\n')
    page = BeautifulSoup(resp.text, "html.parser")  # use the built-in HTML parser
    # Links are taken from the first <tr> of the first <table>; either may be
    # absent on an empty result page, in which case there is nothing to do.
    table = page.find("table")
    first_row = table.find('tr') if table is not None else None
    anchors = first_row.find_all('a') if first_row is not None else []
    weblist = []
    for a in anchors:
        href = a.get('href')
        if href and href.startswith("http"):
            weblist.append(href)
    print('weblist==', weblist)
    # ---------------- every article on this result page ----------------
    for ii, url_a in enumerate(weblist):
        resp_a = requests.get(url_a, headers=headers)
        resp_a.encoding = 'gbk'
        page_a = BeautifulSoup(resp_a.text, "html.parser")
        txt_list = [para.text for para in page_a.find_all('p')]
        print(f'\n-++++++++++++++++++第{ii + 1}篇文章++++++++++++++++-\n', txt_list, len(txt_list))
        # 'with' closes the file even if a write fails.
        with open(f'{out_dir}/({year}){ii + m + 1}.txt', 'w+', encoding='utf-8') as fp:
            fp.writelines(line + '\n' for line in txt_list)  # text only
        print(f'\n> > >{ii + 1}成功! < < <')
    # Advance by exactly the number of links on this page so numbering stays
    # contiguous (the previous "+ 1" skipped one number per page).
    m += len(weblist)
print('---------------\n>>>爬取完畢<<<')
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
import os
# Script: for a fixed search keyword, ask for a year and a page count, then
# save the summary text of every search hit on those pages to its own file.

# Re-wrap stdout so Chinese text prints on a GB-encoded console.
# 'gb' is not a codec name Python recognises (TextIOWrapper raises LookupError);
# GB18030 is a superset of GBK/GB2312, so it is used instead.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Search keyword; it also names the output folder.
query = '交通銀行'
year = int(input('要爬取的年份:'))
pages = int(input("要爬取的頁數(shù)(不小于1):"))
if pages < 1:
    sys.exit()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
# Inline style that identifies the summary cell of a search hit.
# NOTE(review): the colour value after '#' looks truncated; with this exact
# string the lookup may match nothing -- confirm against the live page.
summary_style = 'font-size: 12px;line-height: 24px;color: #;margin-top: 4px;'
out_dir = f'D:/桌面/爬蟲-銀行/中國證券網(wǎng)/{query}/{year}'
# makedirs also creates missing parent folders and tolerates existing ones.
os.makedirs(out_dir, exist_ok=True)

m = 0  # number of files already written
for p in range(1, pages + 1):
    # '&timescope=' / '&timescopecolumn=' restored: they had been mangled
    # into '×cope=' by HTML-entity decoding of '&times'.
    url = f'http://search.cs.com.cn/search?page={p}&channelid=&searchword={query}&keyword={query}&token=12..47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn=&orderby=&timeline=={year}'
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    # The page banner is printed three times, as in the original output.
    for _ in range(3):
        print(f'\n>>>--------------------第{p}頁---------------------<<<\n')
    page = BeautifulSoup(resp.text, "html.parser")  # use the built-in HTML parser
    # Collect the summary text of every result table that has one.
    datalist = []
    for table in page.find_all("table"):
        cell = table.find('td', style=summary_style)
        if cell is not None:
            datalist.append(cell.get_text())
    for ii, text in enumerate(datalist):
        # 'with' closes the file even if the write fails.
        with open(f'{out_dir}/({year}){ii + m + 1}.txt', 'w+', encoding='utf-8') as fp:
            fp.write(text + '\n')  # text only
        print(text)
        print(f'\n> > >第{p}頁,第{ii + 1}篇,成功! < < <')
    # Advance by exactly the number of files written so numbering stays
    # contiguous (the previous "+ 1" skipped one number per page).
    m += len(datalist)
print('----------------------------')
print(f'------\n{year}年,爬取完畢----')
print('----------------------------')
# 目前成都創新互聯公司已為1000多家的企業提供了網站建設、域名、虛擬空間、網站托管、企業網站設計、松北網站維護等服務,公司將堅持客戶導向、應用為本的策略,正道將秉承"和諧、參與、激情"的文化,與客戶和合作伙伴齊心協力一起成長,共同發展。
# 轉載請注明出處,謝謝!!!