#coding=gbk
import codecs
import logging
import os
import string
import sys
import time

import xlrd
import xlwt
from bs4 import BeautifulSoup
from xlrd import open_workbook
class LogMsg:
    """Thin wrapper that attaches a file handler to the root logger.

    Every message passed to ``output`` is emitted at the logger's own
    effective level, so it is always written regardless of which of the
    standard levels was selected at construction time.

    Any failure while initialising, writing or closing prints a short
    message and terminates the process with exit status 1.
    """

    # Numeric levels accepted by the constructor, mapped to the logging
    # constants they select.  Anything else falls back to NOTSET.
    _LEVELS = {
        10: logging.DEBUG,
        20: logging.INFO,
        30: logging.WARNING,
        40: logging.ERROR,
        50: logging.CRITICAL,
    }

    def __init__(self, logfile, Level=0):
        """Open *logfile* and register it as a handler on the root logger.

        logfile -- path of the log file handed to ``logging.FileHandler``.
        Level   -- 10/20/30/40/50 selects DEBUG/INFO/WARNING/ERROR/CRITICAL;
                   any other value sets the logger to NOTSET.
        """
        try:
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter(
                "[%(asctime)s]: %(message)s", "%Y%m%d %H:%M:%S")
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            self.logger.setLevel(self._LEVELS.get(Level, logging.NOTSET))
        except Exception:
            print("log init error!")
            sys.exit(1)

    def output(self, logInfo):
        """Write *logInfo* at the logger's current effective level.

        If the effective level is not one of the five standard levels the
        message is emitted at INFO.
        """
        Level = self.logger.getEffectiveLevel()
        try:
            if Level not in self._LEVELS:
                Level = logging.INFO
            self.logger.log(Level, logInfo)
        except Exception:
            print("log output error!")
            sys.exit(1)

    def close(self):
        """Detach the file handler from the logger and release the file."""
        try:
            self.logger.removeHandler(self.hdlr)
            # Removing the handler alone leaves the log file open; close it
            # so the descriptor is released.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            sys.exit(1)
# Run-wide settings, evaluated once at import time.

# Second-resolution stamp used in the spreadsheet file name; with no time
# tuple given, strftime formats the current local time.
Logtime = time.strftime("%Y%m%d%H%M%S")
# Day-resolution stamp used in the log file name.
logFileTime = time.strftime("%Y%m%d")

# Directory that is scanned for .html files and receives the .xls output.
DATAPATH = '/data/pyExample/'
XLSname = ''.join(('dangjian_', Logtime, '.xls'))

Logfile = '/data/pyExample/logs/htmlparser_%s.log' % (logFileTime,)
# Shared logger for the script, writing at INFO level.
log = LogMsg(Logfile, Level=20)
# Captions written to row 0 of the sheet, in column order.
_HEADER = (
    '内容类型 ',
    '栏目名称',
    '栏目编号',
    '内容名称',
    '时长',
    '关键字',
    '看点',
    '作者',
    '来源',
    '子内容1',
    '子内容2',
)


def _extract_cells(path):
    """Return the GBK-encoded text of every <td> found in the file at *path*.

    The file is decoded as GBK and parsed one line at a time.
    """
    with codecs.open(path, 'r', 'gbk') as html_file:
        lines = html_file.readlines()
    if not lines:
        log.output('not line')
    cells = []
    for line in lines:
        if not line.strip():
            # Blank line: nothing to parse.
            log.output('该处是空行')
            continue
        # NOTE(review): every space is stripped before parsing, which also
        # removes the space inside tags that carry attributes -- confirm
        # this literal was not meant to be a non-breaking space / '&nbsp;'.
        soup = BeautifulSoup(line.replace(' ', ''))
        cells.extend(td.text.encode("gbk") for td in soup.findAll('td'))
    return cells


def _build_row(cells):
    """Map the flat list of cell texts onto one spreadsheet row.

    Raises IndexError when a required cell position is missing and
    ValueError when the duration cell contains no digits.
    """
    folder_name = cells[6]
    content_name = cells[4]
    digits = ''.join(ch for ch in cells[16] if ch.isdigit())
    # The digits are multiplied by 60 -- presumably minutes to seconds;
    # verify against the page layout.
    duration = "%i" % (int(digits) * 60)
    key_word = cells[6]
    description = cells[36]
    video_name = cells[10]
    return ['', folder_name, '', content_name, duration, key_word,
            description, '管理员', '华数编辑', video_name, '']


def main():
    """Convert every .html file in DATAPATH into one row of an .xls file.

    This code works on GBK byte strings and targets Python 2.  Files whose
    cell layout does not match the expected positions are logged and
    skipped instead of aborting the run before the workbook is saved.
    """
    wbk = xlwt.Workbook(encoding='gbk')
    sheet = wbk.add_sheet('基本内容导入模板')
    for col, caption in enumerate(_HEADER):
        sheet.write(0, col, caption)

    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != '.html':
            continue
        log.output('当前文件:' + f)
        cells = _extract_cells(DATAPATH + f)

        # Dump every extracted cell with its position; enumerate keeps the
        # index correct even when two cells hold the same text.
        for idx, cell in enumerate(cells):
            print('%d , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('----------------------------------------')

        try:
            row = _build_row(cells)
        except (IndexError, ValueError) as err:
            log.output('skip %s: %s' % (f, err))
            continue

        for value in (row[1], row[3], row[4], row[5], row[6], row[9]):
            print(value)
        log.output('输出xls数据:' + ','.join(row) + ',')
        print(k)
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    wbk.save(DATAPATH + XLSname)
    print('=========================================')


if __name__ == '__main__':
    main()
# This snippet comes from http://www.codesnippet.cn/detail/011120136875.html
# Source: http://www.codesnippet.cn/detail/011120136875.html