在工作当中, 有时候我们知道染色体编号以及染色体起始终止坐标, 我们想知道这段序列是什么样的碱基.
其一, 我们一般会去 UCSC 的 genome browser 里面查询; 其实也可以通过 UCSC 的接口去解析网页, 然后再提取序列信息
比如 chr17:7676091,7676196 , 那么我只需要构造下面一个网页地址 http://genome.ucsc.edu/cgi-bin/das/hg38/dna?segment=chr17:7676091,7676196
其中 hg38 可以更换成 hg19, dna?segment= 后面的内容可以按照标准格式更换, 就可以返回我们想要的序列了. 网页返回的是一个 xml 格式的信息, 现在用 python 解析一下
- 1 import requests
- import re
- from bs4 import BeautifulSoup
- import xlwt
- import xlrd
- from xlutils.copy import copy
- import os ,sys
- #print(sys.path)
- cwd=os.getcwd()
def gethtmlText(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address to fetch with an HTTP GET request.

    Returns:
        The decoded response body, or an empty string when the request
        fails (connection error, timeout, or an HTTP error status).
    """
    print("111111")
    header = {'user-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()
    except requests.RequestException as err:
        # Keep the best-effort contract (empty string on failure), but report
        # the reason instead of swallowing every exception silently.
        print(f"request failed: {err}", file=sys.stderr)
        return ""
    # Let requests guess the encoding from the body before decoding it.
    r.encoding = r.apparent_encoding
    print("get 222222222222222")
    return r.text


# The script's main() spells this helper as getHTMLText; keep both names usable.
getHTMLText = gethtmlText
def fillDNAList(dnalist, HTML):
    """Extract the sequence header, length header and bases from a DAS reply.

    Args:
        dnalist: List that receives the extracted pieces; it is modified in
            place and also returned.
        HTML: Text of the UCSC DAS ``dna`` response (an XML document).

    Returns:
        ``dnalist`` with up to three items appended, in this order: the
        ``SEQUENCE ... version="1.00"`` text, the ``DNA ... length="N"``
        text, and the sequence upper-cased with newlines removed. A header
        whose pattern is not found is skipped.

    Raises:
        ValueError: If no ``dna`` element with text is found in ``HTML``.
    """
    # Raw strings keep \s and \d as regex escapes rather than string escapes.
    dna_header = re.search(r'SEQUENCE([\s\S]*?version="1.00")', HTML)
    print("match ok")
    if dna_header:
        dnalist.append(dna_header.group())
    length_header = re.search(r'DNA.*?length="(\d*)"', HTML)
    if length_header:
        dnalist.append(length_header.group())
    print("BeautifulSoup tag 属性 获取 dna 标签属性的字符串部分")
    soup = BeautifulSoup(HTML, 'html.parser')
    dna_tag = soup.dna
    if dna_tag is None or dna_tag.string is None:
        # An empty or unexpected reply would otherwise fail with an opaque
        # AttributeError on None.
        raise ValueError("no <DNA> sequence found in the response")
    # Drop the line breaks of the reply and normalise to upper case.
    seq = dna_tag.string.replace('\n', '').upper()
    dnalist.append(seq)
    print("final======>", dnalist)
    return dnalist
def write_excell(dnalist, chrnum, pos, genome='hg19'):
    """Write the sequence to a new ``.xls`` workbook, 50 bases per row.

    Args:
        dnalist: ``[sequence_header, length_header, sequence]``; the first
            two items form the header cell, the third is the bases.
        chrnum: Chromosome identifier, used only in the output file name.
        pos: Position, used only in the output file name.
        genome: Assembly name written into the header cell. Defaults to
            ``'hg19'``, the value that used to be hard-coded.

    Returns:
        Absolute path of the saved workbook.
    """
    head = '>' + genome + ' ' + dnalist[0] + dnalist[1]
    f = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = f.add_sheet('test8', cell_overwrite_ok=True)
    # Row 0 holds the header line.
    sheet.write(0, 0, head)
    dna = dnalist[2]
    print('=====', dna, type(dna))
    # Chunk k (50 bases) goes to row k + 1, directly below the header.
    for row, start in enumerate(range(0, len(dna), 50), start=1):
        sheet.write(row, 0, dna[start:start + 50])
    out_file = 'chrmosome%s_%s.xls' % (chrnum, pos)
    f.save(out_file)
    # Resolve against the directory the file was actually saved in, not a
    # working directory captured earlier.
    return os.path.abspath(out_file)
def modify_excell(out_file_dir, chrnum, pos):
    """Bracket two cells of an existing workbook and save it under a new name.

    Opens the workbook at ``out_file_dir`` with xlrd, prefixes the cell at
    row index 10, column 0 with ``[`` and suffixes the cell at row index 11,
    column 0 with ``]``, then saves an xlutils copy as
    ``new_chrmosome<chrnum>_<pos>.xls``.

    Args:
        out_file_dir: Path of the workbook to read.
        chrnum: Chromosome identifier, used in the new file name.
        pos: Position, used in the new file name.
    """
    source_book = xlrd.open_workbook(out_file_dir)
    first_sheet = source_book.sheet_by_index(0)
    # Build both replacement strings from the read-only sheet first.
    opened = '[' + first_sheet.cell_value(10, 0)
    closed = first_sheet.cell_value(11, 0) + ']'
    # xlrd workbooks cannot be written; work on a writable copy.
    editable_book = copy(source_book)
    target_sheet = editable_book.get_sheet(0)
    target_sheet.write(10, 0, opened)
    target_sheet.write(11, 0, closed)
    editable_book.save('new_chrmosome%s_%s.xls' % (chrnum, pos))
def main():
    """Fetch the 1001 bases centred on chr17:7676091 (hg19) and export them.

    Builds the UCSC DAS URL for ``pos - 500`` to ``pos + 500``, downloads
    the reply, extracts the headers and sequence, writes them to an ``.xls``
    workbook and then saves the bracket-annotated copy.
    """
    hg19 = "hg19"
    chrnum = 17
    pos = 7676091
    start = pos - 500
    end = pos + 500
    position_DNA_list = []
    url = f"http://genome.ucsc.edu/cgi-bin/das/{hg19}/dna?segment=chr{chrnum}:{start},{end}"
    print(url)
    # The download helper is defined as gethtmlText; the previous spelling
    # getHTMLText did not exist and raised NameError.
    HTML = gethtmlText(url)
    if not HTML:
        # The helper returns an empty string on failure; there is nothing
        # to parse or write in that case.
        print("download failed, nothing to write")
        return
    dnalist = fillDNAList(position_DNA_list, HTML)
    out_file_dir = write_excell(dnalist, chrnum, pos)
    modify_excell(out_file_dir, chrnum, pos)


if __name__ == "__main__":
    main()
结果如下:
- http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=chr17:7675591,7676591
- 111111
- get 222222222222222
- match ok
BeautifulSoup tag 属性 获取 dna 标签属性的字符串部分
- final======> ['SEQUENCE id="chr17"start="7675591"stop="7676591"version="1.00"','DNA length="1001"', 'CCCAAGAGCCTTCAGTATACACATCAATAAAAATAATTTTAATTATTCTGATAAAAGATAAACATGAAAAGTTATGGTATGCAAAGTTGAATGACAACAACTGATACTATTTGAAATAATTGACAGAATTATATTCCGTAACAATTTATAAGCAAAGCCAAAAAAACAATGATCCCTTTGTTGAATGCACAGAACAAATCCATCTTGTCCACGGCTACTGAGCATGCCTGTGATCTCCAGGGGTCACTCAGGTTTGACTCAAAGGATCCAACAGCCTGTAGACCCTGTGCTTGAAGGCATGAGGGTCACCTCTGAGTTCACACTCACTAGTGTCCCTCCTTTCTTCAGAAAGCTAGGAACTGGGAAGACAAGGGGAAAATCAATCAAGGCCTGAGGTATGGGGCTGTAGGCTGGGAGGAAACTAACATTATTGAGAAGCTACTGATGTGAATACATTTCAATTACTACTCACATTGGTTTTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTTTTAAGACGGAGTTTTGCTCTCGTTGCCCAGGCTGGAGTGCAATGGAATGATCTAAGGTCACCACAACCTCCACCTCCCGGTTCAAGCAATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCGTGTGCCACCACACCCAGCTAAGTTTGTATTTTTTTAGTAGAGACGGTGTTTCACCATGTTGGTCAGGCTGGTCTCGAACTCCTGACCTCAAGTGATCCACCCACCTCGGCCTCCCAAAGTGCTGGGATTATAGGCATGAGCCACCACACCCAGCCTCACGTTGGTTTTTGAGATGGATTTTATTGCCATTTTGTACACAAAAAGGTCAAAACTCAGTGAGGTGAATTGACATGACAGTAAGTGAAAGAACTACTATCTGATTGGGGGTCTTCTGCCGCCTGCTCTGGGACTCTTTCTGCTATGACATGAAGGACATTGGCAACCCCAGTCCTTGCAGATTTCTTTCACTGTGTGC']
- ===== CCCAAGAGCCTTCAGTATACACATCAATAAAAATAATTTTAATTATTCTGATAAAAGATAAACATGAAAAGTTATGGTATGCAAAGTTGAATGACAACAACTGATACTATTTGAAATAATTGACAGAATTATATTCCGTAACAATTTATAAGCAAAGCCAAAAAAACAATGATCCCTTTGTTGAATGCACAGAACAAATCCATCTTGTCCACGGCTACTGAGCATGCCTGTGATCTCCAGGGGTCACTCAGGTTTGACTCAAAGGATCCAACAGCCTGTAGACCCTGTGCTTGAAGGCATGAGGGTCACCTCTGAGTTCACACTCACTAGTGTCCCTCCTTTCTTCAGAAAGCTAGGAACTGGGAAGACAAGGGGAAAATCAATCAAGGCCTGAGGTATGGGGCTGTAGGCTGGGAGGAAACTAACATTATTGAGAAGCTACTGATGTGAATACATTTCAATTACTACTCACATTGGTTTTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTTTTAAGACGGAGTTTTGCTCTCGTTGCCCAGGCTGGAGTGCAATGGAATGATCTAAGGTCACCACAACCTCCACCTCCCGGTTCAAGCAATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCGTGTGCCACCACACCCAGCTAAGTTTGTATTTTTTTAGTAGAGACGGTGTTTCACCATGTTGGTCAGGCTGGTCTCGAACTCCTGACCTCAAGTGATCCACCCACCTCGGCCTCCCAAAGTGCTGGGATTATAGGCATGAGCCACCACACCCAGCCTCACGTTGGTTTTTGAGATGGATTTTATTGCCATTTTGTACACAAAAAGGTCAAAACTCAGTGAGGTGAATTGACATGACAGTAAGTGAAAGAACTACTATCTGATTGGGGGTCTTCTGCCGCCTGCTCTGGGACTCTTTCTGCTATGACATGAAGGACATTGGCAACCCCAGTCCTTGCAGATTTCTTTCACTGTGTGC <class 'str'>
来源: http://www.bubuko.com/infodetail-2905317.html