Python数据类型结构
对于CSDN中的代码,选择“复制代码”,然后在CMD中打开,选择Ctrl + V,即可直接运行查看结果。
程序,处理的全部是数据.
数据类型结构 字符串
设定映射关系进行转 换。
python">transTab=str.maketrans("0123456789","零壹贰叁肆伍陆柒捌玖")
print("879".translate(transTab))
行切分,形成行的list。将字符串从分行位置 处切分形成list。分行位置包括但不限于: \n、\r\n、\v、\f等
print("蒹葭苍苍,\n白露为霜。\n所谓伊人,\n在水一方。".splitlines())
切分strO所代表的字符串
print("大江东去,浪淘尽,千古风流人物".partition(","))
列表 列表中的切片
列表最重要的是切片。下面这段代码,是提取电脑文件夹中的xls文件名称,并将其整齐化。
写如下代码:
import os
import openpyxlos.chdir("E:\scoretest")
subList=os.listdir(os.getcwd())for isen in subList:#先从文件中取中学的名字。name=isen.split("中")aSen=name[0]bSen=aSen+"中"if bSen[-3:]=="高级中":school=bSen+"学"elif bSen[-3:]=="高中学":school=bSen[:-1]elif bSen[-3]=="第":school=bSen+"学"else:school=bSenprint(school)
这里运用到了列表中的切片。[ ]是 左闭右开。举例如下:
mylist=[1,3,4,5]
print(mylist[0:2])#[1, 3]
通过列表生成ID号码
myList=["002020072"+str(i).rjust(3,"0") for i in range(1,23)]
for i in myList:print(i)
元组
元组不可改。
符号表示:( )
#一定搞清楚,元组不可改,在列表中的数个元组,改的是列表中的元素。
#字典,couple,都是新的英文单词而已,知道存在的基本原理和道理。#把python得到的成绩txt文本导入到Excel中,可以做表格。很方便和神奇啊。
list1=[('楔', 0), ('子', 2967), ('张', 1455), ('天', 1954), ('师', 1132), ('祈', 19), ('禳', 0)]for i in list1:print("{},{}".format(i[0],i[1]),end=" ")
集合
符号表示:{ } 。空的 集合用 ()。
集合的基本操作
#set用于高速的增删改查。但是相应的,他的空间就需要大一些。
set()#可以得到汉字的集合。myStr="红军不怕远征难,万水千山只等闲。"
#集合最重要的特征就是没有重复。他会变成一个一个的。
mySet=set(myStr)
print(mySet)
yourSet={"红军不怕远征难,万水千山只等闲。"}
yourSet.add("apple") #集合中用add而不是用append
#set不保证你的顺序。他的最重要特点就是不保证序,查找的速度相当快。list量大的时候,查找的速度比较慢。
set=()#空的集合用括号,而不是花括号。单独的花括号有别的用途。
heLike={'ro','la',"so"}
#set用于高速的增删改查。但是相应的,他的空间就需要大一些。集合函数的掌握得多多实践。差集,交集,子集等需要实现。
#集合是无序不重复的元素的集合。
#集合的元素没法更改。只有改成list才能够改。
#取的键的集合
#注意,虽然都用了花括号,字典和集合不同。字典有映射关系。
#集合是无序不重复的元素的集合。
set1={('汪', 3), ('盛', 155), ('驸', 40), }
set2={('烦', 166), ('磷', 1), ('宁', 183), ('蛟', 1), ('欹', 2), ('钝', 3), ('厦', 22), ('鳏', 2), ('方', 1132), ('尸', 43), ('哀', 37)}#集合的元素没法更改。只有改成list才能够改。
list1=list(set1)
for i in list1:i=list(i)
print(list1)##??此处行不通,不能将集合中的元素( )转为列表[ ]
print("OK")
a=('烦', 166)
set1.add(a)
print(set1)#添加,只能是单个元素。a=[('茅', 15), ('狼', 115), ('服', 220), ('拚', 24), ('族', 13)]
set1.update(a)
print(set1)#添加,update多个元素b=('宁', 183)
set2.remove(b)#删除
print(set2)print(set1,set2)#单纯罗列两个集合print(set1|set2)#求并集
print(set1.union(set2))print(set2&set1)#求交集
print(set1.intersection(set2))print(set1-set2)#求差集''''....(后面部分。名著-词频统计)Set1 = set(list1)#把列表变成了集合。#print(Set1)return Set1,dic1,list1#return返回值是和外界沟通的,调用函数后,再定义一个值把函数中的值接着。同时return可以返回多个函数值。data=TXT(txt1)##!!!! !!!
set1=data[0]
dic1=data[1]
list1=data[2]data=TXT(txt2)
set2=data[0]
dic2=data[1]
list2=data[2]#inter=charSet1.intersection(charSet2)
#union=charSet1.union(charSet2)
#字相等,出现次数不一致,在交集中仍然存在。所以此路不同。
#set1zero=charSet1-inter#某个字在其中一部小说出现,但在另外一部小说不出现,没有出现的小说记为 0得到的集合。
#set2zero=charSet2-inter
#print(len(set1zero))listZero=[]
for i in set1zero:listZero.append(i[0])
print(listZero)#print("并集",union)
'''
#myFile.close()
集合是无序的
list1=[i for i in range(10)]+[i for i in range(10)]
print("My list is :",list1)
MySet=set(list1)
print("My set is:",MySet)
MyTuple=tuple(list1)
print("My tuple is:",MyTuple)
HisSet=MySet.pop()
YourSet=MySet.clear()
print("Your Set is:",YourSet)
print("His Set is:",HisSet)
字典 字典的基本操作
#字典应用示例
dictA={"Apple":"苹果","Computer":"计算机","Mouse":"鼠标","Mobile":"手机"}
print(list(dictA)) #将字典的键转换为list类型
print(list(dictA.items())) #将字典的键和值组装成tuple,然后转换为list
print(list(dictA.keys())) #将字典的键转换为list类型
print(list(dictA.values())) #将字典的值转换为list类型#engItem对应字典的键,chnItem对应字典的值
#相当于一个tuple对应字典items()函数的一个个结果
for engItem,chnItem in dictA.items():print(engItem,"-->",chnItem)wFile=open("..\ABC.txt","w",encoding="utf-8")
print(dictA,file=wFile)
wFile.close()
两个字典的相加
#两个字典的相加
dicA={'a':1,'b':2,'c':3,'d':4,'f':"hello"}
dicB={'b':3,'d':5,'e':7,'m':9,'k':'world'}
def addDic(dicA,dicB):newDic={}for key in dicA:if dicB.get(key):newDic[key]=dicA[key]+dicB[key]else:newDic[key]=dicA[key]for key in dicB:if dicA.get(key):passelse:newDic[key]=dicB[key]return newDic
print(addDic(dicA,dicB))'''
print(dic.items())#会把这些值弄成元组。
print(dic.keys)
print(dic.values)print(list(dic.items()))#会把这些值弄成元组。
print(list(dic.keys))
print(list(dic.values))#用item 的方法更加直观一些。#找到年龄最大的人,并输出。请找出程序中有什么问题。
person = {"李先生":18,"王先生":50,"张先生":20,"孙先生":22}
for key in person.keys():print(key)for value in person.values():print(value)for item in person.items():print(item)for i in person:print(i)m='李先生'
for key in person.keys():if person[m]<=person[key]:m=key# 把大的值赋给了m
print("最大的人是{},年龄是{}。".format(m,person[m]))字典的输出通常也转化为列表。因为字典太特殊了。d = {123:"123", 456:"456", 789:"789"}
key = []
value = []
for i in d.items():key.append(i[0])value.append(i[1])
print(key)
print(value)
'''mydic={'地铁6号线': [('23', [4449, 4415, 8864]), ('25', [4364, 4350, 8714]), ('33', [4113, 4145, 8258]), ('29', [3786, 3990, 7776]), ('41', [3856, 3835, 7691]), ('27', [3722, 3920, 7642]), ('31', [3536, 3453, 6989]), ('35', [3477, 3468, 6945]), ('37', [3346, 3469, 6815]), ('39', [3305, 3332, 6637]), ('21', [3347, 3148, 6495]), ('43', [2819, 2794, 5613]), ('3', [2726, 2716, 5442]), ('13', [2681, 2661, 5342]), ('59', [2458, 2387, 4845]), ('61', [2426, 2315, 4741]), ('51', [2237, 2222, 4459]), ('55', [2134, 2227, 4361]), ('45', [2174, 2145, 4319]), ('5', [2246, 2035, 4281]), ('49', [2112, 2134, 4246]), ('53', [2004, 2178, 4182]), ('14', [1811, 1925, 3736]), ('15', [1909, 1773, 3682]), ('63', [1869, 1746, 3615]), ('9', [1796, 1674, 3470]), ('7', [1735, 1643, 3378]), ('65', [1702, 1586, 3288]), ('47', [1688, 1594, 3282]), ('18', [1583, 1566, 3149]), ('57', [1480, 1539, 3019]), ('11', [1442, 1461, 2903]), ('6', [1535, 1347, 2882]), ('8', [1307, 1357, 2664]), ('12', [1239, 1409, 2648]), ('1', [1193, 1187, 2380]), ('19', [1100, 1124, 2224]), ('67', [957, 909, 1866]), ('22', [837, 933, 1770]), ('17', [833, 859, 1692]), ('20', [789, 856, 1645]), ('4', [789, 795, 1584]), ('16', [641, 620, 1261]), ('10', [607, 636, 1243]), ('24', [568, 641, 1209]), ('79', [544, 628, 1172]), ('69', [402, 372, 774]), ('2', [295, 373, 668]), ('83', [299, 338, 637]), ('77', [280, 286, 566]), ('85', [262, 294, 556]), ('89', [271, 262, 533]), ('75', [230, 239, 469]), ('91', [207, 216, 423]), ('73', [202, 193, 395]), ('87', [133, 126, 259]), ('71', [117, 133, 250]), ('93', [29, 20, 49])], '地铁7号线': [('23', [4449, 4415, 8864]), ('25', [4364, 4350, 8714]), ('33', [4113, 4145, 8258]), ('29', [3786, 3990, 7776]), ('41', [3856, 3835, 7691]), ('27', [3722, 3920, 7642]), ('31', [3536, 3453, 6989]), ('35', [3477, 3468, 6945]), ('37', [3346, 3469, 6815]), ('39', [3305, 3332, 6637]), ('21', [3347, 3148, 6495]), ('43', [2819, 2794, 5613]), ('3', [2726, 2716, 5442]), ('13', [2681, 2661, 5342]), ('59', [2458, 2387, 4845]), ('61', [2426, 2315, 4741]), ('51', [2237, 2222, 4459]), ('55', [2134, 2227, 4361]), ('45', [2174, 2145, 4319]), ('5', [2246, 2035, 4281]), ('49', [2112, 2134, 4246]), ('53', [2004, 2178, 4182]), ('14', [1811, 1925, 3736]), ('15', [1909, 1773, 3682]), ('63', [1869, 1746, 3615]), ('9', [1796, 1674, 3470]), ('7', [1735, 1643, 3378]), ('65', [1702, 1586, 3288]), ('47', [1688, 1594, 3282]), ('18', [1583, 1566, 3149]), ('57', [1480, 1539, 3019]), ('11', [1442, 1461, 2903]), ('6', [1535, 1347, 2882]), ('8', [1307, 1357, 2664]), ('12', [1239, 1409, 2648]), ('1', [1193, 1187, 2380]), ('19', [1100, 1124, 2224]), ('67', [957, 909, 1866]), ('22', [837, 933, 1770]), ('17', [833, 859, 1692]), ('20', [789, 856, 1645]), ('4', [789, 795, 1584]), ('16', [641, 620, 1261]), ('10', [607, 636, 1243]), ('24', [568, 641, 1209]), ('79', [544, 628, 1172]), ('69', [402, 372, 774]), ('2', [295, 373, 668]), ('83', [299, 338, 637]), ('77', [280, 286, 566]), ('85', [262, 294, 556]), ('89', [271, 262, 533]), ('75', [230, 239, 469]), ('91', [207, 216, 423]), ('73', [202, 193, 395]), ('87', [133, 126, 259]), ('71', [117, 133, 250]), ('93', [29, 20, 49])]}
#print(mydic)
alllist=[]
valuelist=[]for i in mydic.items():valuelist.append(i[1])alllist.append(i)
for i in keylist:print(i)for i in valuelist:#print(i)print(i[0],i[1],i[2])print(i[0][1][2])[('23', [4449, 4415, 8864])
输出了
('23', [4449, 4415, 8864]) ('25', [4364, 4350, 8714]) ('33', [4113, 4145, 8258])
8864
('23', [4449, 4415, 8864]) ('25', [4364, 4350, 8714]) ('33', [4113, 4145, 8258])
8864
字典移除键值 .pop()
.pop("")
##给定一个字典,然后计算它们所有数字值的和。计算key 对应的键值
test_dict = {"Runoob" : 1, "Google" : 2, "Taobao" : 3, "Zhihu" : 4}del(test_dict['Zhihu'])
print (test_dict) test_dict = {"Runoob" : 1, "Google" : 2, "Taobao" : 3, "Zhihu" : 4} ### 输出原始的字典
## #使用 pop 移除 Zhihu
removed_value = test_dict.pop('Zhihu',"有无此数")
print(removed_value)tmp=test_dict.pop('Google')
print(tmp)# 输出移除后的字典
print ("字典移除后 : " + str(test_dict)) print ("移除的 key 对应的 value 为 : " + str(removed_value)) print ('\r') ##使用 pop() 移除没有的 key 不会发生异常,我们可以自定义提示信息
removed_value = test_dict.pop('Baidu', '没有该键(key)')
tmp=test_dict.pop("Google")
print(tmp)
print(str(tmp))
print(str(test_dict))# 输出移除后的字典
print ("字典移除后 : " + str(test_dict))
print ("移除的值为 : " + str(removed_value))print("..######..\n..#....#..\n..######..", end = " ")
字典到列表的转化
字典-键值-排序
字典排序不是你看的那个样子,字典排序有那个固定的键值一一对应。建议转化为列表。
当转化为列表的时候,发现确实成功了。字典和列表的转化如下:
dic1={"北京":34,"上海":34,"广州":32}
list1=list(dic1.items())
print("字典到列表",list1)myList.sort(key=lambda x:x[1],reverse=True)#按照列表排列
字典-get(,)+1
运行如下代码:
myDict={2:23,3:333,4:444}
print(myDict.get(5,0))#运行结果为:0 因为字典中的键key 并没有5。
print(myDict.get(2))#运行结果为:23
#如果字典中没有该键,则用get(, ),输出了myvalue
但是,可以优化以上代码,尝试下面的格式
for ilist in mylist:xing=ilist[0]xingDict[xing]=xingDict.get(xing,0)+1#如此步骤更少。
.items()
resultFile=open("/mytest.txt","w",encoding="utf-8")
numDict={x:x**2 for x in range(1,20) if x%2==0}for (k,v) in numDict.items():print((k,v),file=resultFile)
resultFile.close()for x in range(20,1,-1):print(x)
字典-列表解析-ord-chr
用字典生成字母,并用列表对应。
myDict={chr(x):0 for x in range(ord("a"),ord("z")+1)}
myList=list(myDict.items())
print(myList)
生成结果如下:
[('a', 0), ('b', 0), ('c', 0), ('d', 0), ('e', 0), ('f', 0), ('g', 0), ('h', 0), ('i', 0), ('j', 0), ('k', 0), ('l', 0), ('m', 0), ('n', 0), ('o', 0), ('p', 0), ('q', 0), ('r', 0), ('s', 0), ('t', 0), ('u', 0), ('v', 0), ('w', 0), ('x', 0), ('y', 0), ('z', 0)]
可以直接生成字典形式,并转化为列表。
利用字典进行词频统计
简要代码如下:
#灵活运用字典的键,值。可以分开操作。
list=[2,4,5,3,2,6,4]
dic1={2:34,3:34,9:32}
for key in dic1:if key in list: #想出这样简洁的语法不容易。dic1[key]=0
print(dic1)
#参看作业—名著词频统计。
利用字典统计文本数据,具体案例如下
#打开两个文件
storyFile=open("TheStoryoftheStone.txt","r",encoding="utf-8")#打开文件
guyFile=open("AllMenAreBrothers.txt","r",encoding="utf-8")#打开文件storyData=storyFile.read() #读取数据,字符串
guyData=guyFile.read() #读取数据,字符串storyFile.close() #关闭文件
guyFile.close() #关闭文件storyHZ={} #红楼梦的字频for singleHZ in storyData:if singleHZ==" " or singleHZ =="\n":continueif singleHZ not in storyHZ:storyHZ[singleHZ]=1else:storyHZ[singleHZ]+=1#print(storyHZ)guyHZ={} #水浒传的字频for singleHZ in guyData:if singleHZ==" " or singleHZ =="\n":continueif singleHZ not in guyHZ:guyHZ[singleHZ]=1else:guyHZ[singleHZ]+=1#print(guyHZ)totalHZ={}#计算所有汉字的集合
allHZ=set(storyHZ)|set(guyHZ) #得到所有汉字的集合#得到每个汉字分别在红楼梦和水浒传中出现的次数
for singleHZ in allHZ:if singleHZ==" " or singleHZ =="\n":continue#计算出在红楼梦对应汉字的次数storySingleHZCount=0 #保存红楼梦中汉字出现次数if singleHZ in storyHZ:storySingleHZCount=storyHZ[singleHZ]#计算出在《水浒传》对应汉字的次数guySingleHZCount=0 #保存水浒传中汉字出现次数if singleHZ in guyHZ:guySingleHZCount=guyHZ[singleHZ]#合并存入字典totalHZ[singleHZ]=(storySingleHZCount,guySingleHZCount)print(totalHZ.items())#排序
dataAfterSorted=list(sorted(totalHZ.items(),key=lambda X:X[1][0]+X[1][1],reverse=True))
#print(dataAfterSorted[:100])wFile=open(r"D:\PKU\课堂教学\2019Autumn\FCA\课堂程序\20191030\HZ.txt","w",encoding="utf-8")for everyHZ in dataAfterSorted[:100]:print(everyHZ[0],everyHZ[1][0],everyHZ[1][1],sep=",",file=wFile) #输出到文件wFile.close() #关闭文件
字典的多级结构
字典-多级结构-离婚率和结婚率.
#读入数据
rFile=open("E:\\编程\\课堂老师程序\\20191106\\Marriage2007_2018.txt","r",encoding="utf-8")
content=rFile.read()
rFile.close()#输出内容看一看
#print(content)#按行切分
dataLines=content.splitlines()
print(dataLines)hunDict={} #结婚的数据!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
for everyLine in dataLines:tmpData=everyLine.split(",")Sheng=tmpData[0]dataDict={}#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!print(Sheng)for Year in range(2007,2019):dataLine=tmpData[Year-2007+1]dataLine=dataLine.split("_")onData=int(dataLine[0])offData=int(dataLine[1])print(Year,onData,offData)dataDict[Year]=(onData,offData)hunDict[Sheng]=dataDict
print(hunDict)wFile=open("D:\marriage.txt","w",encoding="utf-8")
firstLine=","
for Year in range(2007,2019):firstLine+=str(Year)+"年,"
print(firstLine[:-1],file=wFile)for everySheng in hunDict:lineData=everySheng+","for everyYear in range(2007,2019):onData=hunDict[everySheng][everyYear][0]offData=hunDict[everySheng][everyYear][1]lineData+=str(round(offData/onData,4)*100)+"%"+","print(lineData[:-1],file=wFile)wFile.close()#eof
利用字典处理50万人名
#打开文件的函数。
def Open():file=open('50万人名.txt','r',encoding='utf-8')mydata=file.read()file.close()datalist=mydata.splitlines()mylist=[] #姓名的列表for everyStr in datalist:newStr=list(everyStr)mylist.append(newStr) #将姓和名分开。xinglist=[] #姓的列表。姓一个一个放到列表minglist=[]for everylist in mylist:xinglist.append(everylist[0])minglist.append(everylist[1:])return xinglist,minglist'''
#第一和第二函数,可以合并直接在字典中解决。
xingDic={}
for ilist in mylist:xing=ilist[0]xingDict[xing]=xingDict.get(xing,0)+1#如此步骤更少。namelen=len(ilist)if namelen==3:if ilist[1]==ilist[2]:repeatStr=ilist[1]+ilist[2]repeatDic[repeatStr]=repeatDic.get(xing,0)+1passelif namelen==4:if ilist[1]==ilist[2]passif ilist[2]==ilist[3]pass'''#统计姓的函数
def Xing():xinglist=Open()[0]myDict={}#姓的字典 for Xing in xinglist:myDict[Xing]=myDict.get(Xing,0)+1tmplist=list(myDict.items())tmplist.sort(key=lambda x:x[1],reverse=True)#print(tmpList[:200]),输出前200姓for i in tmplist:print(i[0],i[1],sep=',')#统计常用字的函数
def ChangYongZi():mylist=[]minglist=Open()[1]for ilist in minglist:for everyChr in ilist:mylist.append(everyChr)myDic={}for i in mylist:myDic[i]=myDic.get(i,0)+1tmplist=list(myDic.items())tmplist.sort(key=lambda x:x[1],reverse=True)for i in tmplist[:200]:print(i[0],i[1],sep='\t')#统计重名的函数
def ChongMing():minglist=Open()[1]afterMinglist=list(filter(lambda x:len(x)>=2,minglist))#名中有两个字重复。说明名字长队得大于等于3chongMinglist=[]for i in afterMinglist:if len(i)==2 and i[0]==i[1]:chongMinglist.append(i)#i[1:]输出姓后面的名,刨除姓。elif len(i)==3 and (i[0]==i[1] or i[1]==i[2]):chongMinglist.append(i[1:])#print(chongMinglist)。重名的各个字在列表中。输出['宸', '宸'], ['东', '东']tmplist=[]for ilist in chongMinglist:ming=''.join(ilist)tmplist.append(ming)#print(tmplist),输出'瑶瑶', '盈盈myDict={}for ming in tmplist:myDict[ming]=myDict.get(ming,0)+1newlist=list(myDict.items())newlist.sort(key=lambda x:x[1],reverse=True)for i in newlist[:200]:print(i[0],i[1],sep='\t')#输出前200对myfile=open('1700014174_work01.txt','w',encoding='utf-8')
ChangYongZi()
Xing()
ChongMing()
myfile.close()
简单数据 的数学应用 中位数
下面这段代码是错误的:
myList=[12,34,45]def Zhong():N=len(myList)mySum=sum(myList)if N%2==0:tmp=myList[N//2-1]+myList[N//2]zhong=tmp//2else:zhong=myList[(N+1)//2-1]return zhongprint(Zhong())
注意:中位数,要先把所给各个数字按照大小顺序排列。这是中位数的定义。所以修改代码如下:
'''
myList=[12,34,45,34,34,4]def fZhongWei(myList):myList.sort()N=len(myList)if N%2==1:zhong=myList((N+1)//2)else:zhong=(myList(N//2)+myList(N//2-1))//2#应该是方括号return zhongzhong=fZhongWei(myList)
print(zhong)#列表。
'''myList=[1,2,4,5]def fZhongWei(myList):myList.sort()N=len(myList)if N%2==1:zhong=myList[(N+1)//2-1]else:zhong=(myList[N//2-1]+myList[N//2])//2return zhongzhong=fZhongWei(myList)
print(zhong)
质因数 质数
求质数或者质因数,代码如下:
def isPrime(N):if N<2:return Falseif N==2:return Truefor i in range(2,N):if N%i==0:return Falseelse:return TrueprimeList=[i for i in range(1,1000) if isPrime(i)==True]def myIsPrime(N):i=2while i1:#while条件可以尝试以最后结果为准。求质数,从2,3……一直到N,i1if N%i==0: N=N//imyList.append(i)print(i)input()else:i+=1return myListprint(ZhiYinShu(1468))
寻找质因数,需要用到循环语句。
def ZhiYinShu(N):i=2myList=[]while N>1:#while条件可以尝试以最后结果为准。求质数,从2,3……一直到N,i1if N%i==0: N=N//imyList.append(i)print(i)input()else:i+=1return myListN=int(input(":"))
for i in range(2,N):#for循环中i每次自动加了1,while循环中i得手动加1while N>1:if N%i==0: print(i)N=N//icontinueelse:breakelse:break
随机数
随机数,需要用大函数。
import random
print(random.choice(range(1,1000)))#输出1000以内的随机数
复杂数据 图片 文本
文本数据本质就是字符串。对于字符串的操作,可以应用到文本上面。
打开docx文本
import docx
file=docx.Document("test.docx")wfile=open("result.txt","w",encoding="utf-8")
for p in file.paragraphs:line=p.textif "=" in line:line=line.strip("=")print(line,file=wfile)else:myline=line.split(" ")for word in myline[3:]:print("\t",word,sep=" ",end="",file=wfile)print("\n",file=wfile)wfile.close()
对文本切分
按照行,按照逗号切分
rFile=open("HZ.txt","r",encoding="utf-8")
rData=rFile.read()
rFile.close()rData=rData.split("\n")
for line in rData[:10]:tmp=line.split(",")print(tmp[0],"对应的数据是",tmp[1],tmp[2])
文言文词频统计
def InterTxt():import jieba#中文文章需要分词才能进行词频统。一定一定主义,我把安装包移动到了同一个文件夹后,才能使用import调用jiebafile1=open("file1.txt","rt",encoding="GB18030")file2=open("file2.txt","rt",encoding="GB18030")data1=file1.read()#这个read必不可少,可以连在一起写f=open("Hong.txt","rt",encoding="utf-8").read()data2=file2.read()file1.close()file2.close()for ch in '!,.:;[\\]@&*:“”‘’;/~`?,● 。· ampqot?、\u3000 \\u()!\n<>hr【】&,l,;,《》1,2,3,4,5,,6,5,8.9,0.,7,':data1 = data1.replace(ch, "")data2 = data2.replace(ch, "")words1=jieba.lcut(data1)words2=jieba.lcut(data2)#print(words1)return words1,words2
InterTxt()def WordCOuntDic():words=InterTxt()word1=words[0]word2=words[1]counts1={}counts1Singel={}for word in word1:if len(word) == 1:if word in counts1Singel:counts1Singel[word]+=1else:counts1Singel[word]=1else:if word in counts1:counts1[word] += 1else:counts1[word] = 1counts2={}counts2Singe2={}#单字词单独列一个表格for word in word2:if len(word)==1:if word in counts2Singe2:counts2Singe2[word]+=1else:counts2Singe2[word]=1else:if word in counts2:counts2[word] += 1else:counts2[word] = 1#excludes={"什么","一个","我们","那里",}#删去不需要的词#for word in excludes: # 注意,excludes 字典中的词语得存在,如果不存在,遍历循环的时候就会出错。#del(counts1[word])#del(counts2[word])#del(counts2Singe2[word])#del(counts1Singel[word])items1=list(counts1.items())#把字典列表化,便于使用items2=list(counts2.items())#把字典列表化,便于使用items1Singel=list(counts1Singel.items())items2singel=list(counts2Singe2.items())items1.sort(key=lambda x:x[1],reverse=True)#排序,按照数量items2.sort(key=lambda x:x[1],reverse=True) # 排序,按照数量items1Singel.sort(key=lambda x:x[1],reverse=True)items2singel.sort(key=lambda x:x[1],reverse=True)#print(items1)return items1,items2,items1Singel,items2singeldef OutResult():items=WordCOuntDic()itemsDouble1=items[0]itemsDouble2=items[1]itemsSingle1=items[2]itemsSingle2=items[3]wFile= open("GuDaiCiyu.txt", "wt", encoding="GB18030") # r,raw的格式。print("文件一:",file=wFile)for everyWord in itemsDouble1[:100]:print(everyWord[0], everyWord[1], file=wFile)for everyWord in itemsSingle1[:100]:print(everyWord[0], everyWord[1], file=wFile)print("文件二:",file=wFile)for everyWord in itemsDouble2[:100]:print(everyWord[0], everyWord[1], file=wFile)for everyWord in itemsSingle2[:100]:print(everyWord[0], everyWord[1], file=wFile)wFile.close()OutResult()