Source code for pcasuite.pcz_similarity

#!/usr/bin/env python3

"""Module containing the PCZsimilarity class and the command line interface."""
import argparse
import json
import numpy as np
import shutil
from pathlib import PurePath
from biobb_common.tools import file_utils as fu
from math import exp
from biobb_common.generic.biobb_object import BiobbObject
from biobb_common.configuration import settings
from biobb_common.tools.file_utils import launchlogger


[docs]class PCZsimilarity(BiobbObject): """ | biobb_flexserv PCZsimilarity | Compute PCA similarity between two given compressed PCZ files. | Wrapper of the pczdump tool from the PCAsuite FlexServ module. Args: input_pcz_path1 (str): Input compressed trajectory file 1. File type: input. `Sample file <https://github.com/bioexcel/biobb_flexserv/raw/master/biobb_flexserv/test/data/pcasuite/pcazip.pcz>`_. Accepted formats: pcz (edam:format_3874). input_pcz_path2 (str): Input compressed trajectory file 2. File type: input. `Sample file <https://github.com/bioexcel/biobb_flexserv/raw/master/biobb_flexserv/test/data/pcasuite/pcazip.pcz>`_. Accepted formats: pcz (edam:format_3874). output_json_path (str): Output json file with PCA Similarity results. File type: output. `Sample file <https://github.com/bioexcel/biobb_flexserv/raw/master/biobb_flexserv/test/reference/pcasuite/pcz_similarity.json>`_. Accepted formats: json (edam:format_3464). properties (dict - Python dictionary object containing the tool parameters, not input/output files): * **amplifying_factor** (*float*) - ("0.0") common displacement (dx) along the different eigenvectors. If 0, the result is the absolute similarity index (dot product). * **binary_path** (*str*) - ("pczdump") pczdump binary path to be used. * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. Examples: This is a use example of how to use the building block from Python:: from biobb_flexserv.pcasuite.pcz_similarity import pcz_similarity pcz_similarity( input_pcz_path1='/path/to/pcazip_input1.pcz', input_pcz_path2='/path/to/pcazip_input2.pcz', output_json_path='/path/to/pcz_similarity.json', properties=prop) Info: * wrapped_software: * name: FlexServ PCAsuite * version: >=1.0 * license: Apache-2.0 * ontology: * name: EDAM * schema: http://edamontology.org/EDAM.owl """ def __init__(self, input_pcz_path1: str, input_pcz_path2: str, output_json_path: str, properties: dict = None, **kwargs) -> None: properties = properties or {} # Call parent class constructor super().__init__(properties) self.locals_var_dict = locals().copy() # Input/Output files self.io_dict = { 'in': {'input_pcz_path1': input_pcz_path1, 'input_pcz_path2': input_pcz_path2}, 'out': {'output_json_path': output_json_path} } # Properties specific for BB self.properties = properties self.amplifying_factor = properties.get('amplifying_factor') self.binary_path = properties.get('binary_path', 'pczdump') # Check the properties self.check_properties(properties) self.check_arguments() # Check two eigenvectors to be compatible for dot product # i.e. same number of vectors and values per vector
[docs] def are_compatible(self, eigenvectors_1, eigenvectors_2): # Check the number of eigenvectors and the number of values in both eigenvectors to match if len(eigenvectors_1) != len(eigenvectors_2): print('WARNING: Number of eigenvectors does not match') return False if len(eigenvectors_1[0]) != len(eigenvectors_2[0]): print('WARNING: Number of values in eigenvectors does not match') return False return True
# Weighted Cross Product (WCP). # Get the weighted cross product between eigenvectors # This is meant to compare PCA results for molecular dynamics structural conformations # The number of eigenvectors to be compared may be specified. All (0) by default # DISCLAIMER: This code has been translated from a perl script signed by Alberto Perez (13/09/04) # Exploring the Essential Dynamics of B-DNA; Alberto Perez, Jose Ramon Blas, Manuel Rueda, # Jose Maria Lopez-Bes, Xavier de la Cruz and Modesto Orozco. J. Chem. Theory Comput.2005,1.
[docs] def get_similarity_index(self, eigenvalues_1, eigenvectors_1, eigenvalues_2, eigenvectors_2, dx=None): # Check the number of eigenvectors and the number of values in both eigenvectors to match if not self.are_compatible(eigenvectors_1, eigenvectors_2): raise SystemExit('Eigenvectors are not compatible') # Find out the total number of eigenvectors # Set the number of eigenvectors to be analyzed in case it is not set or it exceeds the total eigenvectors_number = min(len(eigenvectors_1), len(eigenvectors_2)) # Find out the number of atoms in the structure # Eigenvectors are atom coordinates and each atom has 3 coordinates (x, y, z) if len(eigenvectors_1[0]) % 3 != 0: raise SystemExit('Something is wrong with eigenvectors since number of values is not divisor of 3') atom_number = int(len(eigenvectors_1[0]) / 3) # Amplifying factor: if it is 0 the algorithm is the same as a simple dot product. # The value of the ~10th eigenvalue is usually taken. if dx is not None: amplifying_factor = dx else: amplifying_factor = eigenvalues_1[eigenvectors_number-1] # Get the denominator # Find new denominator # for ($i=0;$i<$nvec;$i++){ # $cte1+=exp(-1/$val_1[$i]*$ampf); # $cte2+=exp(-1/$val_2[$i]*$ampf); # $part1+=exp(-2/$val_1[$i]*$ampf)*exp(-2/$val_1[$i]*$ampf); # $part2+=exp(-2/$val_2[$i]*$ampf)*exp(-2/$val_2[$i]*$ampf); # } cte1 = part1 = cte2 = part2 = 0 for eigenvalue in eigenvalues_1: cte1 += exp(-1 / eigenvalue * amplifying_factor) part1 += exp(-2 / eigenvalue * amplifying_factor) ** 2 for eigenvalue in eigenvalues_2: cte2 += exp(-1 / eigenvalue * amplifying_factor) part2 += exp(-2 / eigenvalue * amplifying_factor) ** 2 denominator = part1 * cte2 * cte2 / cte1 / cte1 + part2 * cte1 * cte1 / cte2 / cte2 # Get all eigenvector values together eigenvector_values_1 = [v for ev in eigenvectors_1 for v in ev] eigenvector_values_2 = [v for ev in eigenvectors_2 for v in ev] # IDK what it is doing now total_summatory = 0 for i in range(eigenvectors_number): for j in range(eigenvectors_number): # Array has vectors in increasing order of vap, get last one first a = (eigenvectors_number - 1 - i) * atom_number * 3 b = (eigenvectors_number - i) * atom_number * 3 - 1 c = (eigenvectors_number - 1 - j) * atom_number * 3 d = (eigenvectors_number - j) * atom_number * 3 - 1 temp1 = eigenvector_values_1[a:b] temp2 = eigenvector_values_2[c:d] if len(temp1) != len(temp2): raise ValueError("Projection of vectors of different size!!") # Project the two vectors add = 0 for k, value_1 in enumerate(temp1): value_2 = temp2[k] add += value_1 * value_2 add = add * exp(-1 / eigenvalues_1[i] * amplifying_factor - 1 / eigenvalues_2[j] * amplifying_factor) add2 = add ** 2 total_summatory += add2 similarity_index = total_summatory * 2 / denominator return similarity_index
# Weighted Cross Product (WCP). # DF implementation of Alberto's formula # Exploring the Essential Dynamics of B-DNA; Alberto Perez, Jose Ramon Blas, Manuel Rueda, # Jose Maria Lopez-Bes, Xavier de la Cruz and Modesto Orozco. J. Chem. Theory Comput.2005,1.
[docs] def eigenmsip(self, eigenvalues_1, eigenvectors_1, eigenvalues_2, eigenvectors_2, dx=None): evals1 = np.array(eigenvalues_1) evals2 = np.array(eigenvalues_2) evecs1 = np.array(eigenvectors_1) evecs2 = np.array(eigenvectors_2) n_components = len(eigenvectors_1) # Amplifying factor: if it is 0 the algorithm is the same as a simple dot product. # The value of the ~10th eigenvalue is usually taken. if dx is not None: amplifying_factor = dx else: amplifying_factor = evals1[n_components-1] e1 = np.exp(-(amplifying_factor)**2/evals1) e2 = np.exp(-(amplifying_factor)**2/evals2) e1_2 = np.exp(-2*(amplifying_factor)**2/evals1) e2_2 = np.exp(-2*(amplifying_factor)**2/evals2) sume1 = np.sum(e1) sume2 = np.sum(e2) denominator = np.sum((e1_2/sume1**2)**2)+np.sum((e2_2/sume2**2)**2) # numerator_df = np.square(np.dot(evecs1, evecs2)*np.outer(e1, e2)/(sume1*sume2)) # numerator_df = 2 * np.sum(numerator_df) val_tmp = 0 accum_a = 0 c = sume1*sume2 for pc in range(0, n_components): for pc2 in range(0, n_components): eve1 = evecs1[pc] eve2 = evecs2[pc2] eva1 = evals1[pc] eva2 = evals2[pc2] a = np.dot(eve1, eve2) b = np.exp(-(amplifying_factor)**2/eva1 - (amplifying_factor)**2/eva2) val_tmp = val_tmp + ((a * b)/c)**2 accum_a = accum_a + a numerator = 2 * val_tmp return numerator/(denominator)
# Get the dot product matrix of two eigenvectors
[docs] def dot_product(self, eigenvectors_1, eigenvectors_2): # Check the number of eigenvectors and the number of values in both eigenvectors to match if not self.are_compatible(eigenvectors_1, eigenvectors_2): raise SystemExit('Eigenvectors are not compatible') # Get the dot product dpm = np.dot(eigenvectors_1, np.transpose(eigenvectors_2)) return dpm
# Get the dot product matrix of two eigenvectors (squared and normalized). # Absolute Similarity Index (Hess 2000, 2002)
[docs] def dot_product_accum(self, eigenvectors_1, eigenvectors_2): n_components = len(eigenvectors_1) # Get the dot product dpm = self.dot_product(eigenvectors_1, eigenvectors_2) sso = (dpm * dpm).sum() / n_components # sso = (dpm * dpm).sum() / n_components return sso
# Get the subspace overlap # Same as before, but with a square root
[docs] def get_subspace_overlap(self, eigenvectors_1, eigenvectors_2): # Get the number of eigenvectors n_components = len(eigenvectors_1) # Get the dot product dpm = self.dot_product(eigenvectors_1, eigenvectors_2) sso = np.sqrt((dpm * dpm).sum() / n_components) # sso = (dpm * dpm).sum() / n_components return sso
# Classic RMSip (Root Mean Square Inner Product), gives the same results as the previous function get_subspace_overlap
[docs] def get_rmsip(self, eigenvectors_1, eigenvectors_2): # Get the number of eigenvectors n_components = len(eigenvectors_1) accum = 0 for pc in range(0, n_components): for pc2 in range(0, n_components): dpm = np.dot(eigenvectors_1[pc], eigenvectors_2[pc2]) val = dpm * dpm accum = accum + val sso = np.sqrt(accum / n_components) # sso = (dpm * dpm).sum() / n_components return sso
# RWSIP, Root Weighted Square Inner Product. Same as before, but weighted using the eigen values. # See Edvin Fuglebakk and others, Measuring and comparing structural fluctuation patterns in large protein datasets, # Bioinformatics, Volume 28, Issue 19, October 2012, Pages 2431–2440, https://doi.org/10.1093/bioinformatics/bts445
[docs] def get_rwsip(self, eigenvalues_1, eigenvectors_1, eigenvalues_2, eigenvectors_2): # Get the number of eigenvectors n_components = len(eigenvectors_1) accum = 0 norm = 0 for pc in range(0, n_components): for pc2 in range(0, n_components): dpm = np.dot(eigenvectors_1[pc], eigenvectors_2[pc2]) val = dpm * dpm * eigenvalues_1[pc] * eigenvalues_2[pc2] accum = accum + val norm = norm + eigenvalues_1[pc] * eigenvalues_2[pc] sso = np.sqrt(accum / norm) # sso = (dpm * dpm).sum() / n_components return sso
[docs] @launchlogger def launch(self): """Launches the execution of the FlexServ pcz_similarity module.""" # Setup Biobb if self.check_restart(): return 0 # self.stage_files() # Internal file paths # try: # # Using rel paths to shorten the amount of characters due to fortran path length limitations # input_pcz_1 = str(Path(self.stage_io_dict["in"]["input_pcz_path1"]).relative_to(Path.cwd())) # input_pcz_2 = str(Path(self.stage_io_dict["in"]["input_pcz_path2"]).relative_to(Path.cwd())) # output_json = str(Path(self.stage_io_dict["out"]["output_json_path"]).relative_to(Path.cwd())) # except ValueError: # # Container or remote case # output_json = self.stage_io_dict["out"]["output_json_path"] # Manually creating a Sandbox to avoid issues with input parameters buffer overflow: # Long strings defining a file path makes Fortran or C compiled programs crash if the string # declared is shorter than the input parameter path (string) length. # Generating a temporary folder and working inside this folder (sandbox) fixes this problem. # The problem was found in Galaxy executions, launching Singularity containers (May 2023). # Creating temporary folder self.tmp_folder = fu.create_unique_dir() fu.log('Creating %s temporary folder' % self.tmp_folder, self.out_log) shutil.copy2(self.io_dict["in"]["input_pcz_path1"], self.tmp_folder) shutil.copy2(self.io_dict["in"]["input_pcz_path2"], self.tmp_folder) # Temporary output temp_json = "output.json" temp_out_1 = "output1.dat" temp_out_2 = "output2.dat" # Command line 1 # pczdump -i structure.ca.std.pcz --evals -o evals.txt # self.cmd = [self.binary_path, # Evals pcz 1 # "-i", input_pcz_1, # "-o", temp_out_1, # "--evals", ';', # self.binary_path, # Evals pcz 2 # "-i", input_pcz_2, # "-o", temp_out_2, # "--evals" # ] self.cmd = ['cd', self.tmp_folder, ';', self.binary_path, # Evals pcz 1 "-i", PurePath(self.io_dict["in"]["input_pcz_path1"]).name, "-o", temp_out_1, "--evals", ';', self.binary_path, # Evals pcz 2 "-i", PurePath(self.io_dict["in"]["input_pcz_path2"]).name, "-o", temp_out_2, "--evals" ] # Run Biobb block 1 self.run_biobb() # Parse output evals info_dict = {} info_dict['evals_1'] = [] info_dict['evals_2'] = [] with open(PurePath(self.tmp_folder).joinpath(temp_out_1), 'r') as file: for line in file: info = float(line.strip()) info_dict['evals_1'].append(info) with open(PurePath(self.tmp_folder).joinpath(temp_out_2), 'r') as file: for line in file: info = float(line.strip()) info_dict['evals_2'].append(info) num_evals_1 = len(info_dict['evals_1']) num_evals_2 = len(info_dict['evals_2']) num_evals_min = min(num_evals_1, num_evals_2) info_dict['num_evals_min'] = num_evals_min # Command line 2 # pczdump -i structure.ca.std.pcz --evals -o evals.txt self.cmd = [] self.cmd.append('cd') self.cmd.append(self.tmp_folder) self.cmd.append(';') for pc in (range(1, num_evals_min+1)): # Evecs pcz 1 self.cmd.append(self.binary_path) self.cmd.append("-i") # self.cmd.append(input_pcz_1) self.cmd.append(PurePath(self.io_dict["in"]["input_pcz_path1"]).name) # self.cmd.append("-o evecs_1_pc{}".format(pc)) self.cmd.append("-o") # self.cmd.append(str(Path(self.stage_io_dict.get("unique_dir")).joinpath("evecs_1_pc{}".format(pc)))) self.cmd.append("evecs_1_pc{}".format(pc)) self.cmd.append("--evec={}".format(pc)) self.cmd.append(";") # Evals pcz 2 self.cmd.append(self.binary_path) self.cmd.append("-i") # self.cmd.append(input_pcz_2) self.cmd.append(PurePath(self.io_dict["in"]["input_pcz_path2"]).name) # self.cmd.append("-o evecs_2_pc{}".format(pc)) self.cmd.append("-o") # self.cmd.append(str(Path(self.stage_io_dict.get("unique_dir")).joinpath("evecs_2_pc{}".format(pc)))) self.cmd.append("evecs_2_pc{}".format(pc)) self.cmd.append("--evec={}".format(pc)) self.cmd.append(";") # Run Biobb block 2 self.run_biobb() # Parse output evecs info_dict['evecs_1'] = {} info_dict['evecs_2'] = {} eigenvectors_1 = [] eigenvectors_2 = [] for pc in (range(1, num_evals_min+1)): pc_id = "pc{}".format(pc) info_dict['evecs_1'][pc_id] = [] info_dict['evecs_2'][pc_id] = [] # with open(str(Path(self.stage_io_dict.get("unique_dir")).joinpath("evecs_1_pc{}".format(pc))), 'r') as file: with open(PurePath(self.tmp_folder).joinpath("evecs_1_pc{}".format(pc)), 'r') as file: list_evecs = [] for line in file: info = line.strip().split(' ') for nums in info: if nums: list_evecs.append(float(nums)) info_dict['evecs_1'][pc_id] = list_evecs eigenvectors_1.append(list_evecs) # with open(str(Path(self.stage_io_dict.get("unique_dir")).joinpath("evecs_2_pc{}".format(pc))), 'r') as file: with open(PurePath(self.tmp_folder).joinpath("evecs_2_pc{}".format(pc)), 'r') as file: list_evecs = [] for line in file: info = line.strip().split(' ') for nums in info: if nums: list_evecs.append(float(nums)) info_dict['evecs_2'][pc_id] = list_evecs eigenvectors_2.append(list_evecs) # simIndex = self.get_similarity_index(info_dict['evals_1'], eigenvectors_1, info_dict['evals_2'], eigenvectors_2, self.amplifying_factor) # info_dict['similarityIndex_WCP2'] = float("{:.3f}".format(simIndex)) # dotProduct = self.get_subspace_overlap(eigenvectors_1, eigenvectors_2) # info_dict['similarityIndex_rmsip2'] = float("{:.3f}".format(dotProduct)) eigenmsip = self.eigenmsip(info_dict['evals_1'], eigenvectors_1, info_dict['evals_2'], eigenvectors_2, self.amplifying_factor) info_dict['similarityIndex_WCP'] = float("{:.3f}".format(eigenmsip)) rmsip = self.get_rmsip(eigenvectors_1, eigenvectors_2) info_dict['similarityIndex_rmsip'] = float("{:.3f}".format(rmsip)) rwsip = self.get_rwsip(info_dict['evals_1'], eigenvectors_1, info_dict['evals_2'], eigenvectors_2) info_dict['similarityIndex_rwsip'] = float("{:.3f}".format(rwsip)) dotp = self.dot_product_accum(eigenvectors_1, eigenvectors_2) info_dict['similarityIndex_dotp'] = float("{:.3f}".format(dotp)) with open(PurePath(self.tmp_folder).joinpath(temp_json), 'w') as out_file: out_file.write(json.dumps(info_dict, indent=4)) # Copy outputs from temporary folder to output path shutil.copy2(PurePath(self.tmp_folder).joinpath(temp_json), PurePath(self.io_dict["out"]["output_json_path"])) # remove temporary folder(s) self.tmp_files.extend([ self.tmp_folder ]) self.remove_tmp_files() self.check_arguments(output_files_created=True, raise_exception=False) return self.return_code
[docs]def pcz_similarity(input_pcz_path1: str, input_pcz_path2: str, output_json_path: str, properties: dict = None, **kwargs) -> int: """Create :class:`PCZsimilarity <flexserv.pcasuite.pcz_similarity>`flexserv.pcasuite.PCZsimilarity class and execute :meth:`launch() <flexserv.pcasuite.pcz_similarity.launch>` method""" return PCZsimilarity(input_pcz_path1=input_pcz_path1, input_pcz_path2=input_pcz_path2, output_json_path=output_json_path, properties=properties).launch()
[docs]def main(): parser = argparse.ArgumentParser(description='Compute PCA Similarity from a given pair of compressed PCZ files.', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999)) parser.add_argument('--config', required=False, help='Configuration file') # Specific args required_args = parser.add_argument_group('required arguments') required_args.add_argument('--input_pcz_path1', required=True, help='Input compressed trajectory file 1. Accepted formats: pcz.') required_args.add_argument('--input_pcz_path2', required=True, help='Input compressed trajectory file 2. Accepted formats: pcz.') required_args.add_argument('--output_json_path', required=True, help='Output json file with PCA similarity. Accepted formats: json.') args = parser.parse_args() args.config = args.config or "{}" properties = settings.ConfReader(config=args.config).get_prop_dic() # Specific call pcz_similarity(input_pcz_path1=args.input_pcz_path1, input_pcz_path2=args.input_pcz_path2, output_json_path=args.output_json_path, properties=properties)
if __name__ == '__main__': main()