## Source code for ScoringFunctions

## TEMPY is a Python library designed to help the user manipulate and analyse atomic structures and density maps from 3D EM. 
## Copyright (c) 2013 Daven Vasishtan, Irene Farabella, Arun Prasad Pandurangan, Harpal Sahota, Frank Alber and Maya Topf


from EMMap import *
from ProtRep_Biopy import *

#from ProtRep_ire3 import *
from PDBParser import  *
from StructureBlurrer import *
#import math
from scipy.spatial import KDTree
from scipy.signal import resample
#import itertools
from MapParser import *
#import numpy as np
#from scipy import *

class ScoringFunctions:
    """
    A class containing various scoring functions for density fitting.

    Vasishtan and Topf (2011) Scoring functions for cryoEM density
    fitting. J Struct Biol 174:333-343.
    """

    def __init__(self):
        pass
[docs] def mapComparison(self, map1, map2): """ Compare the sampling rate, box size and origin of two maps. Returns True if they are the same, False otherwise. Arguments: *map1, map2* EMMap instances to compare. """ if (map1.apix - map2.apix < 1E-6) and map1.box_size() == map2.box_size() and map1.origin == map2.origin: return True else: return False
def _matchMaps(self, map1, map2): ## NOTE: CAN'T WORK, NEEDS A WAY OF SCORING DIFFERENT MAPS ## he tried to resample 1 map on to the other (and the the 2 on 1) #m1 = map1.copy() #m2 = map2.copy() #if m1.apix == m2.apix: # ex_origin_shift = (0,0,0) #elif m1.apix > m2.apix: # m2 = m2.resample_by_apix(m1.apix) # #ex_origin_shift = m1.centre()-m2.centre() # m2 = m2.resize_map(m1.box_size()) #else: # m1 = m1.resample_by_apix(m2.apix) # #ex_origin_shift = m2.centre()-m1.centre() # m1 = m1.resize_map(m2.box_size()) ##print m1 ##print m2 #if m1.box_size() != m2.box_size(): # m2 = m2.resize_map(m1.box_size()) #origin_shift = [y-x for x,y in zip(m2.origin, m1.origin)] #m2 = m2.shift_map(origin_shift)#shift_map #m2.origin = m1.origin[:] ##return m1, m2 return "Warning: can't match the map at the moment, use map with same box size." #comment all out!
[docs] def CCF(self, map1, map2): """ Calculate cross-correlation between two Map instances. Arguments: *map1, map2* EMMap instances to compare. """ if self.mapComparison(map1, map2): return (map1.normalise().getMap()*map2.normalise().getMap()).mean() else: return "can't Match the map" #m1,m2 = self.matchMaps(map1, map2) #return (m1.normalise().getMap()*m2.normalise().getMap()).mean()
[docs] def LSF(self, map1, map2): """ Calculate least-squares between two Map instances. Arguments: *map1, map2* EMMap instances to compare. """ if self.mapComparison(map1, map2): return ((map1.getMap()-map2.getMap())**2).mean() else: # m1,m2 = self.matchMaps(map1, map2) # return ((m1.getMap()-m2.getMap())**2).mean() return "can't Match the map"
[docs] def laplace_CCF(self, map1, map2, prefil=(False, False)): """ Calculate Laplacian cross-correlation between two Map instances. Based on (Chacon and Wriggers, 2002). Arguments: *map1, map2* EMMap instances to compare. *prefil* Tuple of boolean values, one for each map respectively. True if Map instance is already Laplacian-filtered. False otherwise. """ if not prefil[0]: map1 = map1.laplace_filtered() if not prefil[1]: map2 = map2.laplace_filtered() map1 = map1.normalise() map2 = map2.normalise() return self.CCF(map1, map2)
[docs] def normal_vector_score(self, map1, map2, min_threshold, max_threshold): """ Calculate the Normal Vector Score between two Map instances. Based on 3SOM algorithm (Ceulemans and Russell, 2004) Arguments: *map1, map2* EMMap instances to compare. *min_threshold, max_threshold* need to run get_min_threshold and get_max_threshold """ scores = [] if not self.mapComparison(map1, map2): #map1, map2 = self.matchMaps(map1, map2) return "can't Match the map" points = map1.get_pos(min_threshold, max_threshold) for v in points: n_vec = map1.get_normal_vector(v[0],v[1],v[2]) o_vec = map2.get_normal_vector(v[0],v[1],v[2]) try: #print n_vec, o_vec, n_vec.arg(o_vec) scores.append(abs(n_vec.arg(o_vec))) except ValueError: print 'Error: Angle between '+ str(n_vec) +', '+ str(o_vec) +' for point %d, %d, %d cannot be calculated.' %(v.x,v.y,v.z) # return if max(scores) == min(scores): return 0 else: return -(sum(scores)/len(points))
[docs] def get_partial_DLSF(self, x, map1, map2): """ Calculate the DLSF score between two Map instances. Arguments: *map1, map2* EMMap instances to compare. *x* number of significant points. """ map1_sig_pairs=map1._get_random_significant_pairs(int(x)) otherMap=map2 score = 0.0 for p in map1_sig_pairs: z1 = p[0] y1 = p[1] x1 = p[2] z2 = p[3] y2 = p[4] x2 = p[5] dens = p[6] prot_dens = otherMap.fullMap[z1][y1][x1] - otherMap.fullMap[z2][y2][x2] score += (dens-prot_dens)**2 return score/map1.fullMap.size
[docs] def MI(self, map1, map2, layers=20): """ Calculate the Mutual information Score between two Map instances. Arguments: *map1, map2* EMMap instances to compare. *layers* value for which to bin the map into a limited number of values. Default is 20 as in Shatsky et al., 2008. """ if self.mapComparison(map1, map2): m1, m2 = map1, map2 else: return "Can't Match" #m1,m2 = self.matchMaps(map1, map2) score = 0 m1_levels = (m1.max()-m1.min())/layers m2_levels = (m2.max()-m2.min())/layers for x in range(layers): for y in range(layers): m1_level_map = (m1.getMap() >= m1.min()+(x*m1_levels))*(m1.getMap() <= m1.min()+((x+1)*m1_levels)) m2_level_map = (m2.getMap() >= m2.min()+(y*m2_levels))*(m2.getMap() <= m2.min()+((y+1)*m2_levels)) comb_level_map = m1_level_map*m2_level_map p_m1 = float(m1_level_map.sum())/m1_level_map.size p_m2 = float(m2_level_map.sum())/m2_level_map.size p_comb = float(comb_level_map.sum())/comb_level_map.size if p_comb == 0: mi_score = 0.0 else: #print p_comb, p_m1, p_m2, p_comb/(p_m1*p_m2), log(p_comb/(p_m1*p_m2),2) mi_score = p_comb*math.log(p_comb/(p_m1*p_m2), 2) score += mi_score return score
def _NMI(self, map1, map2, layers=20): """ Normalized Mutual Information from Martin et al. 2005. *not implemented yet* """ #call all normaliz as in #Martin LC, Gloor GB, Dunn SD, Wahl LM. Using information theory to search for co-evolving residues in proteins. Bioinformatics 2005 21(22):4116-4124. print "not implemented yet" def _APC_MI(): """ Average Product Correction Mutual information from Dunn et al. 2008. *not implemented yet* """ #Dunn SD, Wahl LM, Gloor GB. Mutual information without the influence of phylogeny or entropy dramatically improves residue contact prediction. Bioinformatics 2008 24(3):333-340. print "not implemented yet" def _hausdorff_list(self, min_threshold, max_threshold, kdtree, map2): """ This is for the chamdef distance def chamfer_distance, min max density value that define the surface of the protein Arguments: *kdtree* (there are 2 of them in numpy one Cbased on py-based, the latter is better, ctrl) this have to be one of the input. kdtree from map1 *min_threshold, max_threshold* need to run get_min_threshold and get_max_threshold for map2 NOTE: if you keep the kdtree as parametre out os less time consuming as building it takes time. """ points = map2.get_pos(min_threshold, max_threshold) return kdtree.query(points)[0] #kdtree give 2 list 0=distance 1=actual points
[docs] def chamfer_distance(self, map1, map2, min_threshold, max_threshold, kdtree=None): """ Calculate the Chamfer Distance Score between two Map instances. NOT RACCOMANDED. Arguments: *map1, map2* EMMap instances to compare. *min_threshold, max_threshold* need to run get_min_threshold and get_max_threshold for map2 *kdtree* If set True It is possible to choose between two option of kdtree in numpy the one that is py-based is a better choice. """ if self.mapComparison(map1, map2): m1, m2 = map1, map2 else: return "can't match" #m1,m2 = matchMaps(map1, map2) if kdtree: return self.hausdorff_list(min_threshold, max_threshold, kdtree, m2).mean() else: kdtree = m1.makeKDTree(min_threshold, max_threshold) #if you don't assine it wil be build one kdtree return self.hausdorff_list(min_threshold, max_threshold, kdtree, m2).mean()#mean distance to the nearest neighbour #MODIFIED by IF and DV #3-01-2014
[docs] def envelope_score(self,map, cutoff, structure,norm=False): """ Calculate the Envelope Score between a target Map and a Structure Instances. Arguments: *map* Target Map Instance. *cutoff* Calculated with min_threshold for map. *structure* Structure Instance to compare. """ binMap = map.make_bin_map(cutoff) max_score = float(-2*numsum(binMap.fullMap)) min_score = float(numsum(binMap.fullMap)-2*numsum(binMap.fullMap+1)) blurrer = StructureBlurrer() struct_binMap = blurrer.make_atom_overlay_map1(map, structure) grid = struct_binMap.get_pos(0.9,1.1) for x,y,z in grid: g = binMap[z][y][x] if g == -1: binMap[z][y][x] = 2 elif g == 0: binMap[z][y][x] = -2 #score=binMap.fullMap.sum() score = float(numsum(binMap.fullMap)) if norm: norm_score = float((score-min_score)/(max_score-min_score)) return norm_score else: return score #added by IF # 19-12-2013 #ORIGINAL form PAP
    def SCCF(self, map1, resolution_densMap, sigma_map, struct, ssefile_name):
        """
        Calculate the Segment-based cross-correlation from Pandurangan et
        al. 2013, J Struct Biol 2013 Dec 12. Based on the Local CCF
        (Roseman, 2000): a local CCF computed around a selection of atoms
        (secondary-structure elements).

        Arguments:
            *map1*
                Target Map instance.
            *resolution_densMap*
                Parameter needed for StructureBlurrer: resolution of the
                target map.
            *sigma_map*
                Parameter needed for StructureBlurrer: the sigma value
                (multiplied by the resolution) that controls the width of
                the Gaussian. Default value is 0.356. Other values used:
                0.187R, corresponding to the Gaussian width of the Fourier
                transform falling to half the maximum at 1/resolution, as
                used in Situs (Wriggers et al, 1999); 0.356R,
                corresponding to the Gaussian width at 1/e maximum height
                equalling the resolution, the default in Chimera
                (Pettersen et al, 2004); 0.425R, the full-width
                half-maximum being equal to the resolution, as used by
                FlexEM (Topf et al, 2008); 0.5R, the distance between the
                two inflection points being the same length as the
                resolution, an option in Chimera (Pettersen et al, 2004);
                1R, where the sigma value is simply equal to the
                resolution, as used by NMFF (Tama et al, 2004).
            *struct*
                Structure instance to compare.
            *ssefile_name*
                Rigid-body-style file in which the selection is specified:
                one SSE group per line, as space-separated start/end
                residue-number pairs.
        Return:
            String with one LCCF score line per SSE-file line.
        """
        # NOTE(review): this method was reconstructed from a collapsed
        # (whitespace-mangled) source; the structure below -- one score
        # computed per line of the SSE file, inside the loop -- is the most
        # plausible reading and should be confirmed against upstream TEMPy.
        blurrer = StructureBlurrer()
        outline = ""
        ssefile = open(ssefile_name, 'rU')
        nsse = 0
        # Each line of the SSE file is read the same way as a rigid-body
        # (RB) file: consecutive tokens are (start, end) residue pairs.
        for line in ssefile:
            sselist = []
            nsse += 1
            tokens = line.split(' ')
            for i in range(len(tokens)/2):
                start = int(tokens[i*2])
                end = int(tokens[i*2+1])
                sselist.append([start, end])
            print sselist
            # Combine the selected SSEs into one structure and blur it to a
            # simulated map on the same grid as map1.
            sse_struct_list = struct.break_into_segments(sselist)
            sse_struct = struct.combine_SSE_structures(sse_struct_list)
            sim_map = blurrer.gaussian_blur(sse_struct, resolution_densMap, densMap=map1, sigma_coeff=sigma_map, normalise=True)
            # Mask both maps to the region covered by the simulated SSE
            # map: mask all values less than or equal to the minimum
            # density value of the simulated map.
            # NOTE: could be exposed as a general mask() method and reused
            # by envelope_score.
            minDens = 0.0
            sim_min = sim_map.min()
            if sim_min >= 0.0:
                minDens = sim_min
            # Create the mask array and apply it to the EM and simulated maps.
            sim_mask_array = sim_map._get_maskArray(minDens)
            mask_emMap = map1._get_maskMap(sim_mask_array)
            mask_simMap = sim_map._get_maskMap(sim_mask_array)
            # Local CCC over the masked region (sqrt/square come from the
            # numpy star import at the top of the file).
            sse_lccf = (mask_emMap * mask_simMap).sum()/sqrt(square(mask_emMap).sum()*square(mask_simMap).sum())
            # Accumulate the overall score; if more than one SSE line was
            # given, also emit a labelled per-element score.
            outline += '%s\n'%sse_lccf
            if nsse > 1:
                outline += 'LCCF for the SSE %3d: %7.3f\n'%(nsse, sse_lccf)
        ssefile.close()
        return outline

    ### ------------------------------------------------------------------
    ### Disabled, experimental code below -- kept for reference only.
    ### ------------------------------------------------------------------
    # NOTE(review): get_clash_map may duplicate functionality that already
    # exists in EMMap / StructureBlurrer (make_clash_map is defined and
    # used there) -- confirm and remove if redundant. The apix handling
    # also needs checking.
    #
    # def get_clash_map(self, emmap, apix):
    #     # see note in gaussian_blur about make_clash_map
    #     template_grid = emmap.make_clash_map(apix)
    #     return template_grid
    #
    # PAP's overlap ("clash") score helpers. May be slow for assemblies
    # with a high number of components; an alternative approach could
    # speed up the calculation if needed.
    #
    # def get_overlay_comp_maplist(self, struct, template_grid):
    #     blurrer = StructureBlurrer()
    #     overlay_maplist = []
    #     # struct is an Assembly object; iterate over its components.
    #     for x in struct.structList:
    #         overlay_maplist.append(blurrer.make_atom_overlay_map1(template_grid, x))
    #     return overlay_maplist
    #
    # "Clash score": returns minus the fraction of volume that overlaps
    # between each pair of components.
    # def get_sm_score(self, struct, ncomp, template_grid, cvol, apix):
    #     overlay_maplist = self.get_overlay_comp_maplist(struct, template_grid)
    #     cpair = list(itertools.combinations(range(ncomp), 2))
    #     score = 0.0
    #     for i in cpair:
    #         score = score + ((overlay_maplist[i[0]].fullMap * overlay_maplist[i[1]].fullMap).sum() * (apix ** 3)) / (cvol[i[0]] + cvol[i[1]])
    #     return -score
    #
    # Unused binary-map helpers below (purpose unclear; candidates for
    # removal).
    #
    # def convert_bin_map(self, binMap):
    #     """For use with the binary score. Takes in a binary map and adds
    #     the binary map from this structure such that pixels under an atom
    #     with value: -1 --> 2, 0 --> -2, 2 --> -5. The original binary map
    #     from a density map has pixels with value -1 above a specific
    #     threshold, and 0 below. binMap = binary map to be modified."""
    #     TotalScore = 0
    #     grid = self.get_grid_positions(binMap)
    #     protGrid = grid[0]
    #     box_edge_pen = grid[1]
    #     for point in protGrid:
    #         g = binMap.fullMap[point[2]][point[1]][point[0]]
    #         if g == -1:
    #             binMap.fullMap[point[2]][point[1]][point[0]] = 3
    #         elif g == 0:
    #             binMap.fullMap[point[2]][point[1]][point[0]] = -2
    #         elif g == -5:
    #             pass
    #         elif g == 2:
    #             binMap.fullMap[point[2]][point[1]][point[0]] = -5
    #     for x in range(binMap.x_size):
    #         for y in range(binMap.y_size):
    #             for z in range(binMap.z_size):
    #                 if binMap[z][y][x] == 3:
    #                     binMap[z][y][x] = 2
    #     return (binMap, box_edge_pen)
    #
    # def clash_bin_map(self, densMap):
    #     clashMap = densMap.make_clash_map()
    #     protGrid, box_edge_pen = self.getGridPositions(clashMap)
    #     for point in protGrid:
    #         clashMap.fullMap[point[2]][point[1]][point[0]] = 1
    #     return clashMap.fullMap, box_edge_pen