python解析.hhr文件_IT分享知识网

大家好，欢迎来到IT知识分享网。

“HHR” 文件格式通常与HMMER（Hidden Markov Model for Molecular Evolution）软件相关联，该软件用于执行隐马尔可夫模型的序列比对和搜索。HMMER用于生物信息学中的蛋白质和核酸序列比对任务，例如蛋白质家族分类、序列相似性搜索等。HHR 文件是HMMER比对结果的一种输出格式，它包含了与序列比对相关的信息。

hhr文件格式介绍

import dataclasses import re from typing import List, Optional, Sequence @dataclasses.dataclass(frozen=True) class TemplateHit: """Class representing a template hit.""" index: int name: str aligned_cols: int sum_probs: Optional[float] query: str hit_sequence: str indices_query: List[int] indices_hit: List[int] def _get_hhr_line_regex_groups( regex_pattern: str, line: str) -> Sequence[Optional[str]]: match = re.match(regex_pattern, line) if match is None: raise RuntimeError(f'Could not parse query line {line}') return match.groups() def _update_hhr_residue_indices_list( sequence: str, start_index: int, indices_list: List[int]): """Computes the relative indices for each residue with respect to the original sequence.""" counter = start_index for symbol in sequence: if symbol == '-': indices_list.append(-1) else: indices_list.append(counter) counter += 1 def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit: """Parses the detailed HMM HMM comparison section for a single Hit. This works on .hhr files generated from both HHBlits and HHSearch. Args: detailed_lines: A list of lines from a single comparison section between 2 sequences (which each have their own HMM's) Returns: A dictionary with the information from that detailed comparison section Raises: RuntimeError: If a certain line cannot be processed """ # Parse first 2 lines. number_of_hit = int(detailed_lines[0].split()[-1]) name_hit = detailed_lines[1][1:] # Parse the summary line. #pattern = ( # 'Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t' # ' ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t ' # ']*Template_Neff=(.*)') # "()":标记一个子表达式的开始和结束位置. # '.':匹配除换行符 \n 之外的任何单字符 # '*': 匹配前面的子表达式零次或多次。 # '[]':匹配其中列举的字符，可以是很多单个，也可以范围 # '\t': 换行符 pattern = ( 'Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t' ' ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)') # Probab=100.00 E-value=1.5e-142 Score=1017.19 Aligned_cols=366 Identities=56% Similarity=1.021 Sum_probs=364.0 match = re.match(pattern, detailed_lines[2]) # 示例detailed_lines[2]：Probab=91.65 E-value=0.12 Score=40.00 Aligned_cols=62 Identities=16% Similarity=0.149 Sum_probs=42.0 # print(detailed_lines[2]) # print(match) if match is None: raise RuntimeError( 'Could not parse section: %s. Expected this: \n%s to contain summary.' % (detailed_lines, detailed_lines[2])) (_, _, _, aligned_cols, _, _, sum_probs) = [float(x) for x in match.groups()] # The next section reads the detailed comparisons. These are in a 'human # readable' format which has a fixed length. The strategy employed is to # assume that each block starts with the query sequence line, and to parse # that with a regexp in order to deduce the fixed length used for that block. query = '' hit_sequence = '' indices_query = [] indices_hit = [] length_block = None for line in detailed_lines[3:]: # Parse the query sequence line if (line.startswith('Q ') and not line.startswith('Q ss_dssp') and not line.startswith('Q ss_pred') and not line.startswith('Q Consensus')): # Thus the first 17 characters must be 'Q <query_name> ', and we can parse # everything after that. # start sequence end total_sequence_length patt = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)' # pattern示例：306 TATYDFKMADLVAPEATVRRFLQGRRNLAKVCALLRGYLLPGAPADL 355 (431) groups = _get_hhr_line_regex_groups(patt, line[17:]) #print("groups") #print(groups) # Get the length of the parsed block using the start and finish indices, # and ensure it is the same as the actual block length. start = int(groups[0]) - 1 # Make index zero based. delta_query = groups[1] end = int(groups[2]) num_insertions = len([x for x in delta_query if x == '-']) length_block = end - start + num_insertions assert length_block == len(delta_query) # Update the query sequence and indices list. query += delta_query _update_hhr_residue_indices_list(delta_query, start, indices_query) elif line.startswith('T '): # Parse the hit sequence. if (not line.startswith('T ss_dssp') and not line.startswith('T ss_pred') and not line.startswith('T Consensus')): # Thus the first 17 characters must be 'T <hit_name> ', and we can # parse everything after that. # start sequence end total_sequence_length patt = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)' groups = _get_hhr_line_regex_groups(patt, line[17:]) start = int(groups[0]) - 1 # Make index zero based. delta_hit_sequence = groups[1] assert length_block == len(delta_hit_sequence) # Update the hit sequence and indices list. hit_sequence += delta_hit_sequence _update_hhr_residue_indices_list(delta_hit_sequence, start, indices_hit) return TemplateHit( index=number_of_hit, name=name_hit, aligned_cols=int(aligned_cols), sum_probs=sum_probs, query=query, hit_sequence=hit_sequence, indices_query=indices_query, indices_hit=indices_hit, ) def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]: """Parses the content of an entire HHR file.""" lines = hhr_string.splitlines() # Each .hhr file starts with a results table, then has a sequence of hit # "paragraphs", each paragraph starting with a line 'No <hit number>'. We # iterate through each paragraph to parse each hit. block_starts = [i for i, line in enumerate(lines) if line.startswith('No ')] hits = [] if block_starts: block_starts.append(len(lines)) # Add the end of the final block. for i in range(len(block_starts) - 1): hits.append(_parse_hhr_hit(lines[block_starts[i]:block_starts[i + 1]])) return hits query.hhr文件路径 path/to/hh-suite/data/query.hhr with open("query.hhr") as f: hhr_str = f.read() hits = parse_hhr(hhr_str) print(len(hits)) print(hits)

参考：

python正则表达式

免责声明：本站所有文章内容,图片，视频等均是来源于用户投稿和互联网及文摘转载整编而成，不代表本站观点，不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益，请在线联系站长,一经查实,本站将立刻删除。本文来自网络,若有侵权，请联系删除，如若转载，请注明出处：https://haidsoft.com/142871.html

python解析.hhr文件

相关推荐

发表回复