首页 >> 大全

Python数据类型结构

2023-10-10 大全 32 作者:考证青年

对于CSDN中的代码,选择“复制代码”,然后在CMD中打开,选择Ctrl + V,即可直接运行查看结果。

程序,处理的全部是数据.

数据类型结构 字符串

设定映射关系进行转 换。

python">transTab=str.maketrans("0123456789","零壹贰叁肆伍陆柒捌玖")
print("879".translate(transTab))

行切分,形成行的list。将字符串从分行位置 处切分形成list。分行位置包括但不限于: \n、\r\n、\v、\f等

print("蒹葭苍苍,\n白露为霜。\n所谓伊人,\n在水一方。".splitlines())

切分strO所代表的字符串

print("大江东去,浪淘尽,千古风流人物".partition(","))

列表 列表中的切片

列表最重要的是切片。下面这段代码,是提取电脑文件夹中的xls文件名称,并将其整齐化。

写如下代码:


import os
import openpyxlos.chdir("E:\scoretest")
subList=os.listdir(os.getcwd())for isen in subList:#先从文件中取中学的名字。name=isen.split("中")aSen=name[0]bSen=aSen+"中"if bSen[-3:]=="高级中":school=bSen+"学"elif bSen[-3:]=="高中学":school=bSen[:-1]elif bSen[-3]=="第":school=bSen+"学"else:school=bSenprint(school)

这里运用到了列表中的切片。[ ]是 左闭右开。举例如下:

mylist=[1,3,4,5]
print(mylist[0:2])#[1, 3]

通过列表生成ID号码

myList=["002020072"+str(i).rjust(3,"0") for i in range(1,23)]
for i in myList:print(i)

元组

元组不可改。

符号表示:( )

#一定搞清楚,元组不可改,在列表中的数个元组,改的是列表中的元素。
#字典,couple,都是新的英文单词而已,知道存在的基本原理和道理。#把python得到的成绩txt文本导入到Excel中,可以做表格。很方便和神奇啊。
list1=[('楔', 0), ('子', 2967), ('张', 1455), ('天', 1954), ('师', 1132), ('祈', 19), ('禳', 0)]for i in list1:print("{},{}".format(i[0],i[1]),end=" ")

集合

符号表示:{ } 。空的 集合用 ()。

集合的基本操作

#set用于高速的增删改查。但是相应的,他的空间就需要大一些。

set()#可以得到汉字的集合。myStr="红军不怕远征难,万水千山只等闲。"
#集合最重要的特征就是没有重复。他会变成一个一个的。
mySet=set(myStr)
print(mySet)
yourSet={"红军不怕远征难,万水千山只等闲。"}
yourSet.add("apple") #集合中用add而不是用append
#set不保证你的顺序。他的最重要特点就是不保证序,查找的速度相当快。list量大的时候,查找的速度比较慢。
set=()#空的集合用括号,而不是花括号。单独的花括号有别的用途。
heLike={'ro','la',"so"}
#set用于高速的增删改查。但是相应的,他的空间就需要大一些。集合函数的掌握得多多实践。差集,交集,子集等需要实现。

#集合是无序不重复的元素的集合。

#集合的元素没法更改。只有改成list才能够改。

#取的键的集合
#注意,虽然都用了花括号,字典和集合不同。字典有映射关系。
#集合是无序不重复的元素的集合。
set1={('汪', 3), ('盛', 155), ('驸', 40), }
set2={('烦', 166), ('磷', 1), ('宁', 183), ('蛟', 1), ('欹', 2), ('钝', 3), ('厦', 22), ('鳏', 2), ('方', 1132), ('尸', 43), ('哀', 37)}#集合的元素没法更改。只有改成list才能够改。
list1=list(set1)
for i in list1:i=list(i)
print(list1)##??此处行不通,不能将集合中的元素(  )转为列表[ ]
print("OK")
a=('烦', 166)
set1.add(a)
print(set1)#添加,只能是单个元素。a=[('茅', 15), ('狼', 115), ('服', 220), ('拚', 24), ('族', 13)]
set1.update(a)
print(set1)#添加,update多个元素b=('宁', 183)
set2.remove(b)#删除
print(set2)print(set1,set2)#单纯罗列两个集合print(set1|set2)#求并集
print(set1.union(set2))print(set2&set1)#求交集
print(set1.intersection(set2))print(set1-set2)#求差集''''....(后面部分。名著-词频统计)Set1 = set(list1)#把列表变成了集合。#print(Set1)return Set1,dic1,list1#return返回值是和外界沟通的,调用函数后,再定义一个值把函数中的值接着。同时return可以返回多个函数值。data=TXT(txt1)##!!!!    !!!
set1=data[0]
dic1=data[1]
list1=data[2]data=TXT(txt2)
set2=data[0]
dic2=data[1]
list2=data[2]#inter=charSet1.intersection(charSet2)
#union=charSet1.union(charSet2)
#字相等,出现次数不一致,在交集中仍然存在。所以此路不同。
#set1zero=charSet1-inter#某个字在其中一部小说出现,但在另外一部小说不出现,没有出现的小说记为 0得到的集合。
#set2zero=charSet2-inter
#print(len(set1zero))listZero=[]
for i in set1zero:listZero.append(i[0])
print(listZero)#print("并集",union)
'''
#myFile.close()

集合是无序的

list1=[i for i in range(10)]+[i for i in range(10)]
print("My list is :",list1)
MySet=set(list1)
print("My set is:",MySet)
MyTuple=tuple(list1)
print("My tuple is:",MyTuple)
HisSet=MySet.pop()
YourSet=MySet.clear()
print("Your Set is:",YourSet)
print("His Set is:",HisSet)

字典 字典的基本操作

#字典应用示例
dictA={"Apple":"苹果","Computer":"计算机","Mouse":"鼠标","Mobile":"手机"}
print(list(dictA)) #将字典的键转换为list类型
print(list(dictA.items())) #将字典的键和值组装成tuple,然后转换为list
print(list(dictA.keys())) #将字典的键转换为list类型
print(list(dictA.values())) #将字典的值转换为list类型#engItem对应字典的键,chnItem对应字典的值
#相当于一个tuple对应字典items()函数的一个个结果
for engItem,chnItem in dictA.items():print(engItem,"-->",chnItem)wFile=open("..\ABC.txt","w",encoding="utf-8")
print(dictA,file=wFile)
wFile.close()

两个字典的相加

#两个字典的相加
dicA={'a':1,'b':2,'c':3,'d':4,'f':"hello"}
dicB={'b':3,'d':5,'e':7,'m':9,'k':'world'}
def addDic(dicA,dicB):newDic={}for key in dicA:if dicB.get(key):newDic[key]=dicA[key]+dicB[key]else:newDic[key]=dicA[key]for key in dicB:if dicA.get(key):passelse:newDic[key]=dicB[key]return newDic                
print(addDic(dicA,dicB))'''
print(dic.items())#会把这些值弄成元组。
print(dic.keys)
print(dic.values)print(list(dic.items()))#会把这些值弄成元组。
print(list(dic.keys))
print(list(dic.values))#用item 的方法更加直观一些。#找到年龄最大的人,并输出。请找出程序中有什么问题。
person = {"李先生":18,"王先生":50,"张先生":20,"孙先生":22}
for key in person.keys():print(key)for value in person.values():print(value)for item in person.items():print(item)for i in person:print(i)m='李先生'
for key in person.keys():if person[m]<=person[key]:m=key# 把大的值赋给了m
print("最大的人是{},年龄是{}。".format(m,person[m]))字典的输出通常也转化为列表。因为字典太特殊了。d =  {123:"123", 456:"456", 789:"789"}
key = []
value = []
for i in d.items():key.append(i[0])value.append(i[1])
print(key)
print(value)
'''mydic={'地铁6号线': [('23', [4449, 4415, 8864]), ('25', [4364, 4350, 8714]), ('33', [4113, 4145, 8258]), ('29', [3786, 3990, 7776]), ('41', [3856, 3835, 7691]), ('27', [3722, 3920, 7642]), ('31', [3536, 3453, 6989]), ('35', [3477, 3468, 6945]), ('37', [3346, 3469, 6815]), ('39', [3305, 3332, 6637]), ('21', [3347, 3148, 6495]), ('43', [2819, 2794, 5613]), ('3', [2726, 2716, 5442]), ('13', [2681, 2661, 5342]), ('59', [2458, 2387, 4845]), ('61', [2426, 2315, 4741]), ('51', [2237, 2222, 4459]), ('55', [2134, 2227, 4361]), ('45', [2174, 2145, 4319]), ('5', [2246, 2035, 4281]), ('49', [2112, 2134, 4246]), ('53', [2004, 2178, 4182]), ('14', [1811, 1925, 3736]), ('15', [1909, 1773, 3682]), ('63', [1869, 1746, 3615]), ('9', [1796, 1674, 3470]), ('7', [1735, 1643, 3378]), ('65', [1702, 1586, 3288]), ('47', [1688, 1594, 3282]), ('18', [1583, 1566, 3149]), ('57', [1480, 1539, 3019]), ('11', [1442, 1461, 2903]), ('6', [1535, 1347, 2882]), ('8', [1307, 1357, 2664]), ('12', [1239, 1409, 2648]), ('1', [1193, 1187, 2380]), ('19', [1100, 1124, 2224]), ('67', [957, 909, 1866]), ('22', [837, 933, 1770]), ('17', [833, 859, 1692]), ('20', [789, 856, 1645]), ('4', [789, 795, 1584]), ('16', [641, 620, 1261]), ('10', [607, 636, 1243]), ('24', [568, 641, 1209]), ('79', [544, 628, 1172]), ('69', [402, 372, 774]), ('2', [295, 373, 668]), ('83', [299, 338, 637]), ('77', [280, 286, 566]), ('85', [262, 294, 556]), ('89', [271, 262, 533]), ('75', [230, 239, 469]), ('91', [207, 216, 423]), ('73', [202, 193, 395]), ('87', [133, 126, 259]), ('71', [117, 133, 250]), ('93', [29, 20, 49])], '地铁7号线': [('23', [4449, 4415, 8864]), ('25', [4364, 4350, 8714]), ('33', [4113, 4145, 8258]), ('29', [3786, 3990, 7776]), ('41', [3856, 3835, 7691]), ('27', [3722, 3920, 7642]), ('31', [3536, 3453, 6989]), ('35', [3477, 3468, 6945]), ('37', [3346, 3469, 6815]), ('39', [3305, 3332, 6637]), ('21', [3347, 3148, 6495]), ('43', [2819, 2794, 5613]), ('3', [2726, 2716, 5442]), ('13', [2681, 2661, 5342]), ('59', [2458, 2387, 4845]), ('61', [2426, 2315, 4741]), ('51', [2237, 2222, 4459]), ('55', [2134, 2227, 4361]), ('45', [2174, 2145, 4319]), ('5', [2246, 2035, 4281]), ('49', [2112, 2134, 4246]), ('53', [2004, 2178, 4182]), ('14', [1811, 1925, 3736]), ('15', [1909, 1773, 3682]), ('63', [1869, 1746, 3615]), ('9', [1796, 1674, 3470]), ('7', [1735, 1643, 3378]), ('65', [1702, 1586, 3288]), ('47', [1688, 1594, 3282]), ('18', [1583, 1566, 3149]), ('57', [1480, 1539, 3019]), ('11', [1442, 1461, 2903]), ('6', [1535, 1347, 2882]), ('8', [1307, 1357, 2664]), ('12', [1239, 1409, 2648]), ('1', [1193, 1187, 2380]), ('19', [1100, 1124, 2224]), ('67', [957, 909, 1866]), ('22', [837, 933, 1770]), ('17', [833, 859, 1692]), ('20', [789, 856, 1645]), ('4', [789, 795, 1584]), ('16', [641, 620, 1261]), ('10', [607, 636, 1243]), ('24', [568, 641, 1209]), ('79', [544, 628, 1172]), ('69', [402, 372, 774]), ('2', [295, 373, 668]), ('83', [299, 338, 637]), ('77', [280, 286, 566]), ('85', [262, 294, 556]), ('89', [271, 262, 533]), ('75', [230, 239, 469]), ('91', [207, 216, 423]), ('73', [202, 193, 395]), ('87', [133, 126, 259]), ('71', [117, 133, 250]), ('93', [29, 20, 49])]}
#print(mydic)
alllist=[]
valuelist=[]for i in mydic.items():valuelist.append(i[1])alllist.append(i)
for i in keylist:print(i)for i in valuelist:#print(i)print(i[0],i[1],i[2])print(i[0][1][2])[('23', [4449, 4415, 8864])
输出了
('23', [4449, 4415, 8864]) ('25', [4364, 4350, 8714]) ('33', [4113, 4145, 8258])
8864
('23', [4449, 4415, 8864]) ('25', [4364, 4350, 8714]) ('33', [4113, 4145, 8258])
8864

字典移除键值 .pop()

.pop("")

##给定一个字典,然后计算它们所有数字值的和。计算key 对应的键值
test_dict = {"Runoob" : 1, "Google" : 2, "Taobao" : 3, "Zhihu" : 4}del(test_dict['Zhihu'])
print (test_dict) test_dict = {"Runoob" : 1, "Google" : 2, "Taobao" : 3, "Zhihu" : 4} ### 输出原始的字典
##  #使用 pop 移除 Zhihu
removed_value = test_dict.pop('Zhihu',"有无此数")
print(removed_value)tmp=test_dict.pop('Google')
print(tmp)# 输出移除后的字典
print ("字典移除后 : " + str(test_dict)) print ("移除的 key 对应的 value 为 : " + str(removed_value)) print ('\r') ##使用 pop() 移除没有的 key 不会发生异常,我们可以自定义提示信息
removed_value = test_dict.pop('Baidu', '没有该键(key)') 
tmp=test_dict.pop("Google")
print(tmp)
print(str(tmp))
print(str(test_dict))# 输出移除后的字典
print ("字典移除后 : " + str(test_dict)) 
print ("移除的值为 : " + str(removed_value))print("..######..\n..#....#..\n..######..", end = " ") 

字典到列表的转化

字典-键值-排序

字典排序不是你看的那个样子,字典排序有那个固定的键值一一对应。建议转化为列表。

当转化为列表的时候,发现确实成功了。字典和列表的转化如下:

dic1={"北京":34,"上海":34,"广州":32}
list1=list(dic1.items())
print("字典到列表",list1)myList.sort(key=lambda x:x[1],reverse=True)#按照列表排列

字典-get(,)+1

运行如下代码:

myDict={2:23,3:333,4:444}
print(myDict.get(5,0))#运行结果为:0   因为字典中的键key 并没有5。
print(myDict.get(2))#运行结果为:23
#如果字典中没有该键,则用get(, ),输出了myvalue

但是,可以优化以上代码,尝试下面的格式

for ilist in mylist:xing=ilist[0]xingDict[xing]=xingDict.get(xing,0)+1#如此步骤更少。

.items()

resultFile=open("/mytest.txt","w",encoding="utf-8")
numDict={x:x**2 for x in range(1,20) if x%2==0}for (k,v) in numDict.items():print((k,v),file=resultFile)
resultFile.close()for x in range(20,1,-1):print(x)

字典-列表解析-ord-chr

用字典生成字母,并用列表对应。

myDict={chr(x):0 for x in range(ord("a"),ord("z")+1)}
myList=list(myDict.items())
print(myList)

生成结果如下:

[('a', 0), ('b', 0), ('c', 0), ('d', 0), ('e', 0), ('f', 0), ('g', 0), ('h', 0), ('i', 0), ('j', 0), ('k', 0), ('l', 0), ('m', 0), ('n', 0), ('o', 0), ('p', 0), ('q', 0), ('r', 0), ('s', 0), ('t', 0), ('u', 0), ('v', 0), ('w', 0), ('x', 0), ('y', 0), ('z', 0)]

可以直接生成字典形式,并转化为列表。

利用字典进行词频统计

简要代码如下:

#灵活运用字典的键,值。可以分开操作。
list=[2,4,5,3,2,6,4]
dic1={2:34,3:34,9:32}
for key in dic1:if key in list:                             #想出这样简洁的语法不容易。dic1[key]=0
print(dic1)
#参看作业—名著词频统计。

利用字典统计文本数据,具体案例如下

#打开两个文件
storyFile=open("TheStoryoftheStone.txt","r",encoding="utf-8")#打开文件
guyFile=open("AllMenAreBrothers.txt","r",encoding="utf-8")#打开文件storyData=storyFile.read() #读取数据,字符串
guyData=guyFile.read() #读取数据,字符串storyFile.close() #关闭文件
guyFile.close() #关闭文件storyHZ={} #红楼梦的字频for singleHZ in storyData:if singleHZ==" " or singleHZ =="\n":continueif singleHZ not in storyHZ:storyHZ[singleHZ]=1else:storyHZ[singleHZ]+=1#print(storyHZ)guyHZ={} #水浒传的字频for singleHZ in guyData:if singleHZ==" " or singleHZ =="\n":continueif singleHZ not in guyHZ:guyHZ[singleHZ]=1else:guyHZ[singleHZ]+=1#print(guyHZ)totalHZ={}#计算所有汉字的集合
allHZ=set(storyHZ)|set(guyHZ) #得到所有汉字的集合#得到每个汉字分别在红楼梦和水浒传中出现的次数
for singleHZ in allHZ:if singleHZ==" " or singleHZ =="\n":continue#计算出在红楼梦对应汉字的次数storySingleHZCount=0 #保存红楼梦中汉字出现次数if singleHZ in storyHZ:storySingleHZCount=storyHZ[singleHZ]#计算出在《水浒传》对应汉字的次数guySingleHZCount=0 #保存水浒传中汉字出现次数if singleHZ in guyHZ:guySingleHZCount=guyHZ[singleHZ]#合并存入字典totalHZ[singleHZ]=(storySingleHZCount,guySingleHZCount)print(totalHZ.items())#排序
dataAfterSorted=list(sorted(totalHZ.items(),key=lambda X:X[1][0]+X[1][1],reverse=True))
#print(dataAfterSorted[:100])wFile=open(r"D:\PKU\课堂教学\2019Autumn\FCA\课堂程序\20191030\HZ.txt","w",encoding="utf-8")for everyHZ in dataAfterSorted[:100]:print(everyHZ[0],everyHZ[1][0],everyHZ[1][1],sep=",",file=wFile) #输出到文件wFile.close() #关闭文件

字典的多级结构

字典-多级结构-离婚率和结婚率.

#读入数据
rFile=open("E:\\编程\\课堂老师程序\\20191106\\Marriage2007_2018.txt","r",encoding="utf-8")
content=rFile.read()
rFile.close()#输出内容看一看
#print(content)#按行切分
dataLines=content.splitlines()
print(dataLines)hunDict={} #结婚的数据!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
for everyLine in dataLines:tmpData=everyLine.split(",")Sheng=tmpData[0]dataDict={}#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!print(Sheng)for Year in range(2007,2019):dataLine=tmpData[Year-2007+1]dataLine=dataLine.split("_")onData=int(dataLine[0])offData=int(dataLine[1])print(Year,onData,offData)dataDict[Year]=(onData,offData)hunDict[Sheng]=dataDict
print(hunDict)wFile=open("D:\marriage.txt","w",encoding="utf-8")
firstLine=","
for Year in range(2007,2019):firstLine+=str(Year)+"年,"
print(firstLine[:-1],file=wFile)for everySheng in hunDict:lineData=everySheng+","for everyYear in range(2007,2019):onData=hunDict[everySheng][everyYear][0]offData=hunDict[everySheng][everyYear][1]lineData+=str(round(offData/onData,4)*100)+"%"+","print(lineData[:-1],file=wFile)wFile.close()#eof

利用字典处理50万人名

#打开文件的函数。
def Open():file=open('50万人名.txt','r',encoding='utf-8')mydata=file.read()file.close()datalist=mydata.splitlines()mylist=[]                       #姓名的列表for everyStr in datalist:newStr=list(everyStr)mylist.append(newStr)       #将姓和名分开。xinglist=[]                     #姓的列表。姓一个一个放到列表minglist=[]for everylist in mylist:xinglist.append(everylist[0])minglist.append(everylist[1:])return xinglist,minglist'''
#第一和第二函数,可以合并直接在字典中解决。
xingDic={}
for ilist in mylist:xing=ilist[0]xingDict[xing]=xingDict.get(xing,0)+1#如此步骤更少。namelen=len(ilist)if namelen==3:if ilist[1]==ilist[2]:repeatStr=ilist[1]+ilist[2]repeatDic[repeatStr]=repeatDic.get(xing,0)+1passelif namelen==4:if ilist[1]==ilist[2]passif ilist[2]==ilist[3]pass'''#统计姓的函数
def Xing():xinglist=Open()[0]myDict={}#姓的字典    for Xing in xinglist:myDict[Xing]=myDict.get(Xing,0)+1tmplist=list(myDict.items())tmplist.sort(key=lambda x:x[1],reverse=True)#print(tmpList[:200]),输出前200姓for i in tmplist:print(i[0],i[1],sep=',')#统计常用字的函数
def ChangYongZi():mylist=[]minglist=Open()[1]for ilist in minglist:for everyChr in ilist:mylist.append(everyChr)myDic={}for i in mylist:myDic[i]=myDic.get(i,0)+1tmplist=list(myDic.items())tmplist.sort(key=lambda x:x[1],reverse=True)for i in tmplist[:200]:print(i[0],i[1],sep='\t')#统计重名的函数
def ChongMing():minglist=Open()[1]afterMinglist=list(filter(lambda x:len(x)>=2,minglist))#名中有两个字重复。说明名字长队得大于等于3chongMinglist=[]for i in afterMinglist:if len(i)==2 and i[0]==i[1]:chongMinglist.append(i)#i[1:]输出姓后面的名,刨除姓。elif len(i)==3 and (i[0]==i[1] or i[1]==i[2]):chongMinglist.append(i[1:])#print(chongMinglist)。重名的各个字在列表中。输出['宸', '宸'], ['东', '东']tmplist=[]for ilist in chongMinglist:ming=''.join(ilist)tmplist.append(ming)#print(tmplist),输出'瑶瑶', '盈盈myDict={}for ming in tmplist:myDict[ming]=myDict.get(ming,0)+1newlist=list(myDict.items())newlist.sort(key=lambda x:x[1],reverse=True)for i in newlist[:200]:print(i[0],i[1],sep='\t')#输出前200对myfile=open('1700014174_work01.txt','w',encoding='utf-8')
ChangYongZi()
Xing()
ChongMing()
myfile.close()

简单数据 的数学应用 中位数

下面这段代码是错误的:

myList=[12,34,45]def Zhong():N=len(myList)mySum=sum(myList)if N%2==0:tmp=myList[N//2-1]+myList[N//2]zhong=tmp//2else:zhong=myList[(N+1)//2-1]return zhongprint(Zhong())

注意:中位数,要先把所给各个数字按照大小顺序排列。这是中位数的定义。所以修改代码如下:


'''
myList=[12,34,45,34,34,4]def fZhongWei(myList):myList.sort()N=len(myList)if N%2==1:zhong=myList((N+1)//2)else:zhong=(myList(N//2)+myList(N//2-1))//2#应该是方括号return zhongzhong=fZhongWei(myList)
print(zhong)#列表。
'''myList=[1,2,4,5]def fZhongWei(myList):myList.sort()N=len(myList)if N%2==1:zhong=myList[(N+1)//2-1]else:zhong=(myList[N//2-1]+myList[N//2])//2return zhongzhong=fZhongWei(myList)
print(zhong)

质因数 质数

求质数或者质因数,代码如下:

def isPrime(N):if N<2:return Falseif N==2:return Truefor i in range(2,N):if N%i==0:return Falseelse:return TrueprimeList=[i for i in range(1,1000) if isPrime(i)==True]def myIsPrime(N):i=2while i1:#while条件可以尝试以最后结果为准。求质数,从2,3……一直到N,i1if N%i==0:             N=N//imyList.append(i)print(i)input()else:i+=1return myListprint(ZhiYinShu(1468))

寻找质因数,需要用到循环语句。


def ZhiYinShu(N):i=2myList=[]while N>1:#while条件可以尝试以最后结果为准。求质数,从2,3……一直到N,i1if N%i==0:             N=N//imyList.append(i)print(i)input()else:i+=1return myListN=int(input(":"))
for i in range(2,N):#for循环中i每次自动加了1,while循环中i得手动加1while N>1:if N%i==0:        print(i)N=N//icontinueelse:breakelse:break

随机数

随机数,需要用大函数。

import random
print(random.choice(range(1,1000)))#输出1000以内的随机数

复杂数据 图片 文本

文本数据本质就是字符串。对于字符串的操作,可以应用到文本上面。

打开docx文本


import docx
file=docx.Document("test.docx")wfile=open("result.txt","w",encoding="utf-8")
for p in file.paragraphs:line=p.textif "=" in line:line=line.strip("=")print(line,file=wfile)else:myline=line.split(" ")for word in myline[3:]:print("\t",word,sep=" ",end="",file=wfile)print("\n",file=wfile)wfile.close()

对文本切分

按照行,按照逗号切分

rFile=open("HZ.txt","r",encoding="utf-8")
rData=rFile.read()
rFile.close()rData=rData.split("\n")
for line in rData[:10]:tmp=line.split(",")print(tmp[0],"对应的数据是",tmp[1],tmp[2])

文言文词频统计


def InterTxt():import jieba#中文文章需要分词才能进行词频统。一定一定主义,我把安装包移动到了同一个文件夹后,才能使用import调用jiebafile1=open("file1.txt","rt",encoding="GB18030")file2=open("file2.txt","rt",encoding="GB18030")data1=file1.read()#这个read必不可少,可以连在一起写f=open("Hong.txt","rt",encoding="utf-8").read()data2=file2.read()file1.close()file2.close()for ch in '!,.:;[\\]@&*:“”‘’;/~`?,● 。· ampqot?、\u3000 \\u()!\n<>hr【】&,l,;,《》1,2,3,4,5,,6,5,8.9,0.,7,':data1 = data1.replace(ch, "")data2 = data2.replace(ch, "")words1=jieba.lcut(data1)words2=jieba.lcut(data2)#print(words1)return words1,words2
InterTxt()def WordCOuntDic():words=InterTxt()word1=words[0]word2=words[1]counts1={}counts1Singel={}for word in word1:if len(word) == 1:if word in counts1Singel:counts1Singel[word]+=1else:counts1Singel[word]=1else:if word in counts1:counts1[word] += 1else:counts1[word] = 1counts2={}counts2Singe2={}#单字词单独列一个表格for word in word2:if len(word)==1:if word in counts2Singe2:counts2Singe2[word]+=1else:counts2Singe2[word]=1else:if word in counts2:counts2[word] += 1else:counts2[word] = 1#excludes={"什么","一个","我们","那里",}#删去不需要的词#for word in excludes:  # 注意,excludes 字典中的词语得存在,如果不存在,遍历循环的时候就会出错。#del(counts1[word])#del(counts2[word])#del(counts2Singe2[word])#del(counts1Singel[word])items1=list(counts1.items())#把字典列表化,便于使用items2=list(counts2.items())#把字典列表化,便于使用items1Singel=list(counts1Singel.items())items2singel=list(counts2Singe2.items())items1.sort(key=lambda x:x[1],reverse=True)#排序,按照数量items2.sort(key=lambda x:x[1],reverse=True)  # 排序,按照数量items1Singel.sort(key=lambda x:x[1],reverse=True)items2singel.sort(key=lambda x:x[1],reverse=True)#print(items1)return items1,items2,items1Singel,items2singeldef OutResult():items=WordCOuntDic()itemsDouble1=items[0]itemsDouble2=items[1]itemsSingle1=items[2]itemsSingle2=items[3]wFile= open("GuDaiCiyu.txt", "wt", encoding="GB18030")  # r,raw的格式。print("文件一:",file=wFile)for everyWord in itemsDouble1[:100]:print(everyWord[0], everyWord[1], file=wFile)for everyWord in itemsSingle1[:100]:print(everyWord[0], everyWord[1], file=wFile)print("文件二:",file=wFile)for everyWord in itemsDouble2[:100]:print(everyWord[0], everyWord[1], file=wFile)for everyWord in itemsSingle2[:100]:print(everyWord[0], everyWord[1], file=wFile)wFile.close()OutResult()

关于我们

最火推荐

小编推荐

联系我们


版权声明:本站内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件至 88@qq.com 举报,一经查实,本站将立刻删除。备案号:桂ICP备2021009421号
Powered By Z-BlogPHP.
复制成功
微信号:
我知道了