网站建设 >

查看其它板块

Python函数统计词频,利用python进行词频统计

一个txt文档已经用结巴（jieba）分词分好了词，怎样用Python对这个分好词的文档统计词频？求脚本，非常感谢。

#!/usr/bin/env python3

创新互联建站服务项目包括安平网站建设、安平网站制作、安平网页制作以及安平网络营销策划等。多年来，我们专注于互联网行业，利用自身积累的技术优势、行业经验、深度合作伙伴关系等，向广大中小型企业、政府机构等提供互联网行业的解决方案，安平网站推广取得了明显的社会效益与经济效益。目前，我们服务的客户以成都为中心已经辐射到安平省份的部分城市，未来相信会继续扩大服务区域并继续获得客户的支持与信任！

#-*- coding:utf-8 -*-

import os,random

# The input file is assumed to be named aa.txt and to live in the current
# working directory.
filename = 'aa.txt'
dirname = os.getcwd()
f_n = os.path.join(dirname, filename)

# Disabled helper for testing the script: it generates 20 lines of data, each
# holding 1-20 random numbers in the range 1-20.  Uncomment to (re)create the
# input file.
#
# test = ''
# for i in range(20):
#     for j in range(random.randint(1, 20)):
#         test += str(random.randint(1, 20)) + ' '
#     test += '\n'
# with open(f_n, 'w') as wf:
#     wf.write(test)

with open(f_n) as f:
    s = f.readlines()

# Flatten all lines into one list of words.  split() without an argument
# splits on any run of whitespace and drops leading/trailing whitespace, so
# blank lines and doubled spaces do not produce empty-string "words"
# (split(' ') did, and those empty strings were then counted).
words = []
for line in s:
    words.extend(line.split())

def geshi(a, b, c):
    """Format one report row as three left-aligned columns plus a newline.

    The first and last values are padded with ``alignment``'s default width
    (8 columns); the middle value is padded to 18 columns.

    Args:
        a: Value for the first column (converted with ``str``).
        b: Value for the middle column (converted with ``str``).
        c: Value for the last column (converted with ``str``).

    Returns:
        The formatted row, terminated by ``'\\n'``.
    """
    return alignment(str(a)) + alignment(str(b), 18) + alignment(str(c)) + '\n'

def alignment(str1, space=8, align='left'):
    """Pad mixed Chinese/ASCII text to a fixed display width.

    ``str.format`` / ``ljust`` count characters, so columns containing CJK
    text (two terminal columns per character) come out misaligned.  This
    function measures the width as the length of the GBK-encoded text, where
    ASCII takes one byte and CJK characters take two.

    Args:
        str1: The text to pad.
        space: Target width in display columns.  Text that is already at
            least this wide is returned without padding.
        align: ``'left'``, ``'right'`` or ``'center'`` (plus the spelling
            variants listed below).  Any other value returns the text
            unpadded.

    Returns:
        The padded text.
    """
    # GBK is a superset of GB2312, so widths are unchanged for GB2312 text;
    # 'replace' turns characters GBK cannot encode into a one-byte '?'
    # instead of raising UnicodeEncodeError.
    length = len(str1.encode('gbk', 'replace'))
    # Number of padding spaces still needed; never negative.
    space = max(space - length, 0)
    if align in ('left', 'l', 'L', 'Left', 'LEFT'):
        str1 = str1 + ' ' * space
    elif align in ('right', 'r', 'R', 'Right', 'RIGHT'):
        str1 = ' ' * space + str1
    elif align in ('center', 'c', 'C', 'Center', 'CENTER', 'centre'):
        # With an odd amount of padding the extra space goes on the right.
        str1 = ' ' * (space // 2) + str1 + ' ' * (space - space // 2)
    return str1

# Header row of the report.
w_s = geshi('序号', '词', '频率')

# Count every word in a single pass (calling words.count() once per distinct
# word is quadratic), then sort the (word, frequency) pairs by frequency
# descending and by word ascending for ties.
counts = {}
for w in words:
    counts[w] = counts.get(w, 0) + 1
wordcount = sorted(counts.items(), key=lambda l: (-l[1], l[0]))

# One row per word: 1-based rank, the word, its frequency.  enumerate()
# supplies the rank directly instead of searching the list for each pair.
w_s += ''.join(geshi(rank, w, c) for rank, (w, c) in enumerate(wordcount, 1))

# Write the statistics to ar.txt in the same directory as the input.
writefile = 'ar.txt'
w_n = os.path.join(dirname, writefile)
with open(w_n, 'w') as wf:
    wf.write(w_s)

如何用python实现英文短文的双词频统计？

import re
from collections import Counter

try:
    from itertools import imap as map  # Python 2: lazy map
except ImportError:
    pass  # Python 3: the builtin map is already lazy

def parserwords(sentence):
    """Return the adjacent word pairs (bigrams) of *sentence*.

    The sentence is lower-cased and split into words with the regex ``\\w+``;
    every word is then paired with the word that follows it.

    Args:
        sentence: The text to analyse.

    Returns:
        A list of ``(previous_word, word)`` tuples in order of appearance.
        It is empty when the sentence has fewer than two words.
    """
    preword = ''
    result = []
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    for word in re.findall(r'\w+', sentence.lower()):
        # The first word has no predecessor, so it only seeds preword.
        if preword:
            result.append((preword, word))
        preword = word
    return result

context = """
Do you hear the people sing, singing a song of angry men.
It is the music of a people, who will not be slaves again,
when the beating of your heart echoes the beating of the drums.
There is a life about to start when tomorrow comes.
"""

# Split the text into clauses at commas and full stops so that no word pair
# spans a clause boundary, then collect the word pairs of every clause.
words = []
for clause in re.split(r'[,.]', context.lower()):
    words.extend(parserwords(clause))

# How often each word occurs as the first element of a pair, and how often
# each complete pair occurs.
prefixcounter = Counter(pre for pre, _ in words)
counter = Counter(words)

# For every observed pair (pre, post): the share of pairs starting with
# `pre` that continue with `post`.  The `1. *` forces float division.
meter = {}
for pre, post in counter:
    meter[(pre, post)] = 1. * counter[(pre, post)] / prefixcounter[pre]

# Highest ratio first; ties are broken by the pair itself in ascending order.
# A key function replaces the Python-2-only cmp= argument and gives the same
# ordering.
result = sorted(meter.items(), key=lambda item: (-item[1], item[0]))

print(result[:5])

如何用python对文章中文分词并统计词频

使用结巴分词，统计频率可以使用Counter，即from collections import Counter

用Python统计词频

def statistics(astr):
    """Return the distinct tab-separated fields of one line, in first-seen order.

    Args:
        astr: One line of text, optionally ending in a newline.

    Returns:
        A list of the unique fields.  An empty line yields ``['']``.
    """
    slist = astr.split("\t")
    # Remove the line's newline BEFORE de-duplicating; otherwise a last field
    # such as "a\n" is not recognised as a repeat of an earlier "a" and the
    # word ends up in the result twice.
    slist[-1] = slist[-1].replace("\n", "")
    # dict.fromkeys keeps the first occurrence of every key in order.
    return list(dict.fromkeys(slist))

if __name__ == "__main__":
    # Maps each field to the number of times statistics() reported it, i.e.
    # once per input line in which the field appears.
    code_doc = {}
    with open("test_data.txt", "r", encoding='utf-8') as fs:
        # Iterate the file lazily instead of loading every line up front.
        for ln in fs:
            for t in statistics(ln):
                code_doc[t] = code_doc.get(t, 0) + 1
    for keys, count in code_doc.items():
        print(keys + ' ' + str(count))

Python词频统计问题

# Save some text to words.txt first, for example:
#   stu ml ds ml stu stuee zkz wxj Zkz Wxj
File = "words.txt"
number_list = []
with open(File) as f:
    for line in f:
        number_list.extend(line.split())

# One pass over the words records both the index of each word's first
# occurrence and its frequency (list.index/list.count per word is quadratic).
first_index = {}
frequency = {}
for position, item in enumerate(number_list):
    first_index.setdefault(item, position)
    frequency[item] = frequency.get(item, 0) + 1

# Open the output file once and report the words in order of first
# appearance (iterating a set gives an order that can change between runs).
with open('Q1.txt', 'a') as F:
    for item in first_index:
        L = [item, first_index[item], frequency[item]]
        print(L)  # word, index of its first occurrence, frequency
        # One record per line; without the newline all records run together.
        F.write(str(L) + '\n')

如何用python和jieba分词，统计词频？

#! python3

# -*- coding: utf-8 -*-

import os, codecs

import jieba

from collections import Counter

def get_words(txt):
    """Print a frequency table of the 100 most common words in *txt*.

    The text is segmented with ``jieba.cut``.  Segments of a single character
    and the segment ``'\\r\\n'`` are ignored.  Each output row shows the word
    right-aligned to 5 characters, a bar of one ``*`` per three occurrences,
    and the count.

    Args:
        txt: The text to analyse.
    """
    seg_list = jieba.cut(txt)
    c = Counter()
    for x in seg_list:
        # Skip single-character segments and Windows line breaks.
        if len(x) > 1 and x != '\r\n':
            c[x] += 1
    print('常用词频度统计结果')
    for (k, v) in c.most_common(100):
        print('%s%s %s %d' % (' ' * (5 - len(k)), k, '*' * (v // 3), v))

if __name__ == '__main__':
    # Read the whole UTF-8 input file, then print its word-frequency table.
    with codecs.open('19d.txt', 'r', 'utf8') as f:
        txt = f.read()
    get_words(txt)

分享题目：Python函数统计词频,利用python进行词频统计
本文地址：http://cdkjz.cn/article/hcgges.html

返回首页了解更多建站资讯

多年建站经验

多一份参考，总有益处

联系快上网，免费获得专属《策划方案》及报价

咨询相关问题或预约面谈，可以通过以下方式与我们联系

大客户专线成都：13518219792 座机：028-86922220

在线咨询提交需求

友情链接交换友情链接

小谭网创广告成都婚庆策划梅州网站建设成都做网站响应式报价眉山网站建设商城系统开发成都网站建设报价四川电信机房托管迷你发光字

成都网站建设公司地址：成都市青羊区太升南路288号锦天国际A座10层建设咨询028-86922220

成都快上网科技有限公司-四川网站建设设计公司 | 蜀ICP备19037934号 Copyright 2020,ALL Rights Reserved cdkjz.cn | 成都网站建设 | © Copyright 2020版权所有.

专家团队为您提供成都网站建设,成都网站设计,成都品牌网站设计,成都营销型网站制作等服务,成都建网站就找快上网！ | 成都网站建设哪家好？ | 网站建设地图