def statistics(astr):
    """Split one tab-separated line into its unique tokens.

    Args:
        astr: a line of tab-separated tokens, possibly ending in a newline.

    Returns:
        The tokens in first-seen order with duplicates removed; the trailing
        newline (if any) is stripped from the last token only, so a token
        with and without a trailing newline are de-duplicated separately
        (same as the original behavior).
    """
    tokens = astr.split("\t")
    # dict preserves insertion order (Python 3.7+): an ordered de-duplication
    unique = list(dict.fromkeys(tokens))
    unique[-1] = unique[-1].replace("\n", "")
    return unique
if __name__ == "__main__":
    # Count how often each token appears across the whole file.
    code_doc = {}
    with open("test_data.txt", "r", encoding='utf-8') as fs:
        for line in fs:  # iterate lazily instead of readlines()
            for token in statistics(line):
                # dict.get with default replaces the setdefault/else dance
                code_doc[token] = code_doc.get(token, 0) + 1
    for token, count in code_doc.items():
        print(token + ' ' + str(count))
簡單版:
#!/usr/bin/env python3
# Simple version: count English words with a regex, everything else with jieba.
import re
import jieba
from collections import Counter

fname = 'counttest.txt'
with open(fname) as f:
    s = f.read()
# English word, optionally hyphenated once (e.g. "re-use")
pattern = re.compile(r'[a-zA-Z]+\-?[a-zA-Z]*')
english_words = Counter(pattern.findall(s))
# Strip the English words out, then segment the remainder with jieba
other_words = Counter(jieba.cut(pattern.sub('', s)))
print('\n英文單詞統(tǒng)計結(jié)果:\n' + '-' * 17)
print('\n'.join(['{}: {}'.format(i, j) for i, j in english_words.most_common()]))
print('\n中文及符號統(tǒng)計結(jié)果:\n' + '-' * 19)
print('\n'.join(['{}: {}'.format(i, j) for i, j in other_words.most_common()]))
復雜版:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, division, unicode_literals

import os
import re
import sys
import time
from collections import Counter
from datetime import datetime

import jieba
class WordCounter(object):
    """Count word frequencies of a file and write the result to another file.

    English words are extracted with a regex; the remaining text is counted
    either character by character or, when jieba segmentation is enabled,
    token by token.
    """

    def __init__(self, from_file, to_file=None, coding=None, jieba_cut=None):
        """Read and count ``from_file``, writing results to ``to_file``.

        Args:
            from_file: the file to read.
            to_file: the file the result is written to; when it is None (or
                a 'None'/'null'-like string from the command line) the result
                is printed to the terminal instead.
            coding: the file encoding; defaults to auto-detection with the
                chardet module on the first 10000 bytes.
            jieba_cut: any non-None value enables jieba word segmentation.

        Raises:
            Exception: if ``from_file`` does not exist.

        How to use:
            w = WordCounter('a.txt', 'b.txt')
            w.run()
        """
        if not os.path.isfile(from_file):
            raise Exception('No such file: 文件不存在')
        self.f1 = from_file
        self.filesize = os.path.getsize(from_file)
        self.f2 = to_file
        if coding is None:
            try:
                import chardet
            except ImportError:
                # NOTE(review): best-effort auto-install kept from the
                # original; consider failing with a clear message instead.
                os.system('pip install chardet')
                print('-' * 70)
                import chardet
            with open(from_file, 'rb') as f:
                coding = chardet.detect(f.read(10000))['encoding']
        self.coding = coding
        self._c = [Counter(), Counter()]  # [english words, other text]
        self.jieba = jieba_cut is not None

    def run(self):
        """Count the file, then write or print the result; records self.cost."""
        start = time.time()
        self.count_direct(self.f1)
        # Command-line callers may pass the literal string 'None'/'null'
        if self.f2 not in ['None', 'Null', 'none', 'null', None]:
            with open(self.f2, 'wb') as f:
                f.write(self.result.encode(self.coding))
        else:
            print('\nEnglish words:\n' + '-' * 15)
            print(self.result)
        cost = '{:.1f}'.format(time.time() - start)
        self.cost = cost + 's'

    def count_direct(self, from_file):
        """Read the whole file into memory and update both counters."""
        with open(from_file, 'rb') as f:
            data = f.read()
        parsed = self.parse(data)  # parse once, not once per counter
        for i in range(len(self._c)):
            self._c[i].update(parsed[i])

    def parse(self, line):
        """Decode a byte stream and split it into (english, other) Counters."""
        text = line.decode(self.coding)
        # A word may be hyphenated across a line break; remove trailing '-\n'
        text = re.sub(r'\-\n', '', text)
        pattern = re.compile(r'[a-zA-Z]+\-?[a-zA-Z]*')  # English word matcher
        english_words = pattern.findall(text)
        rest = pattern.sub('', text)
        # NOTE(review): with jieba off, the original counts characters of the
        # FULL text (not `rest`) — behavior preserved here.
        ex = Counter(jieba.cut(rest)) if self.jieba else Counter(text)
        return Counter(english_words), ex

    def flush(self):
        """Reset the accumulated statistics."""
        self._c = [Counter(), Counter()]

    @property
    def counter(self):
        """The raw [english, other] Counter pair."""
        return self._c

    @property
    def result(self):
        """The formatted result string, i.e. what gets written to to_file."""
        ss = []
        for c in self._c:
            ss.append(['{}: {}'.format(i, j) for i, j in c.most_common()])
        tip = '\n\n中文及符號統(tǒng)計結(jié)果:\n' + '-' * 15 + '\n'
        return tip.join(['\n'.join(s) for s in ss])
def humansize(size):
    """Convert a byte count into a human-readable string (1024-based).

    >>> humansize(1024)
    '1 KB'
    >>> humansize(1000)
    '1000 B'
    >>> humansize(1024*1024)
    '1 M'
    >>> humansize(1024*1024*1024*2)
    '2 G'
    """
    units = ['B', 'KB', 'M', 'G', 'T']
    for unit in units:
        if size < 1024:
            break
        size = size // 1024  # integer division: results are truncated
    return '{} {}'.format(size, unit)
def main():
    """CLI entry: wordcounter.py from_file to_file [coding=...] [jieba_cut=...]"""
    # Two positional arguments are unpacked below, so argv needs >= 3 entries
    if len(sys.argv) < 3:
        print('Usage: python wordcounter.py from_file to_file')
        exit(1)
    from_file, to_file = sys.argv[1:3]
    args = {'coding': None, 'jieba_cut': 1}
    # Optional key=value overrides may appear anywhere on the command line
    for arg in sys.argv:
        for key in args:
            m = re.search(r'{}=(.+)'.format(key), arg)
            if m:
                args[key] = m.group(1)
    w = WordCounter(from_file, to_file, **args)
    w.run()
if __name__ == '__main__':
    # Run the embedded doctests (e.g. in humansize) before the real work
    import doctest
    doctest.testmod()
    main()
更復雜的:如果是比較大的文件,建議采用多進程,詳情可搜索“多進程讀取大文件并統(tǒng)計詞頻”參考相關(guān)實現(xiàn)。
1、全局變量在函數(shù)中使用時需要加入global聲明
2、獲取網(wǎng)頁內(nèi)容存入文件時的編碼為ascii進行正則匹配時需要decode為GB2312,當匹配到的中文寫入文件時需要encode成GB2312寫入文件。
3、中文字符匹配過濾正則表達式為ur'[\u4e00-\u9fa5]+',使用findall找到所有的中文字符存入分組
4、KEY,Value值可以使用dict存儲,排序后可以使用list存儲
5、字符串處理使用split分割,然后使用index截取字符串,判斷哪些是名詞和動詞
6、命令行使用需要導入os,os.system(cmd)
#! python3
# -*- coding: utf-8 -*-
import codecs
import os
from collections import Counter

import jieba
def get_words(txt):
    """Segment ``txt`` with jieba and print the 100 most common words.

    Single-character tokens and bare '\r\n' line breaks are skipped.
    """
    seg_list = jieba.cut(txt)
    c = Counter()
    for x in seg_list:
        # keep multi-character tokens only; drop line-break tokens
        if len(x) > 1 and x != '\r\n':
            c[x] += 1
    print('常用詞頻度統(tǒng)計結(jié)果')
    for (k, v) in c.most_common(100):
        # pad short words, then draw a crude bar chart scaled by count/3
        print('%s%s %s  %d' % ('  ' * (5 - len(k)), k, '*' * int(v / 3), v))
if __name__ == '__main__':
    # codecs.open decodes the file as UTF-8 while reading
    with codecs.open('19d.txt', 'r', 'utf8') as f:
        txt = f.read()
    get_words(txt)