Source code for aquaduct.geom.master

# -*- coding: utf-8 -*-

# Aqua-Duct, a tool facilitating analysis of the flow of solvent molecules in molecular dynamic simulations
# Copyright (C) 2016-2018  Tomasz Magdziarz, Alicja Płuciennik, Michał Stolarczyk <info@aquaduct.pl>
# Copyright (C) 2019  Tomasz Magdziarz <info@aquaduct.pl>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# This module is a prototype and has to be rewritten.

from aquaduct import logger

import multiprocessing
from multiprocessing import Queue, Manager, Lock, Value, Process
from itertools import zip_longest
from functools import partial

import numpy as np
from scipy.spatial.distance import cdist, pdist

from aquaduct.traj.paths import GenericPathTypeCodes, GenericPaths, yield_single_paths, MasterPath
from aquaduct.utils.helpers import list_blocks_to_slices, strech_zip, zip_zip, xzip_xzip, concatenate
from aquaduct.utils import clui
from aquaduct.utils.maths import make_default_array, defaults
from aquaduct.traj.inlets import InletClusterGenericType, InletClusterExtendedType
from aquaduct.traj.paths import PassingPath
from aquaduct.apps.data import GCS

################################################################################
# Generic type code assigned to each spath part number: parts 0 and 2 map to
# the scope type code, part 1 maps to the object type code.
part2type_dict = {0: GenericPathTypeCodes.scope_name,
                  1: GenericPathTypeCodes.object_name,
                  2: GenericPathTypeCodes.scope_name}
'''
Part number to :class:`~aquaduct.traj.paths.GenericPathTypeCodes` dictionary.
'''

# Part numbers of a separate path; these are exactly the keys of part2type_dict.
# NOTE(review): presumably 0/1/2 stand for the incoming/object/outgoing parts
# of a path - confirm against aquaduct.traj.paths.
parts = (0, 1, 2)
'''
Parts enumerate.
'''

################################################################################

# Module-level cache of FakeSingleResidueSelection objects keyed by resid.
# CTypeSpathsCollection.get_master_path stores one entry per generated master
# path here and reads it back when constructing the MasterPath object.
fsrs_cache = {}


class CTypeSpathsCollectionWorker(object):
    """
    Worker class for averaging spaths in points of master path.

    Instances are callable so they can be handed directly to ``map`` or to
    :meth:`multiprocessing.Pool.imap`.
    """

    def __init__(self, spaths=None, ctype=None, bias_long=5, smooth=None, lock=None):
        """
        Core method for averaging spaths in to master path.

        Averaging is done in chunks.

        :param list spaths: List of separate paths to average.
        :param InletClusterGenericType ctype: CType of spaths.
        :param int bias_long: Bias towards long paths used in :meth:`lens_norm`.
        :param Smooth smooth: Smoothing method.
        :param lock: Lock object; it is only stored, the visible code never acquires it.
        """
        self.spaths = spaths
        assert isinstance(ctype, (InletClusterGenericType, InletClusterExtendedType))
        self.ctype = ctype
        self.bias_long = bias_long  # TODO: check if it is required here
        self.smooth = smooth
        # Precalculated properties of spaths; the owner of the worker is
        # expected to fill them in before the worker is called.
        self.lens_cache = None
        self.lens_real_cache = None
        self.lens_norm_cache = None
        self.full_size_cache = None
        self.lock = lock

    @staticmethod
    def _mean_pairwise_distance(points):
        """
        Returns mean Euclidean distance between all pairs of points.

        With fewer than two points there are no pairs at all; ``0.`` is
        returned then instead of the NaN (and RuntimeWarning) that
        :func:`numpy.mean` produces for the empty result of ``pdist``.

        :param points: Sequence of points of equal dimension.
        :return: Mean mutual distance of points.
        """
        if len(points) < 2:
            return 0.
        return np.mean(pdist(points, 'euclidean'))

    def coords_types_prob_widths(self, sp_slices_):
        """
        Calculates average coordinates, type and width in given chunk.

        Parameter :attr:`sp_slices_` is tuple of length equal to number of spaths.
        It contains slices for all spaths respectively. With these slices spaths
        are cut and **only** resulting chunks are used for calculations.

        Therefore, this method averages spaths in one point of master path. This
        point is defined by slices submitted as :attr:`sp_slices_` parameter.

        Algorithm of averaging (within current chunks of spaths):

        #. Coordinates for all spaths are collected.
        #. Lengths of all spaths are collected (from cached variables) and kept
           as lists of lengths equal to chunks' sizes.

           .. note::

               Lengths of collected lengths of spaths are of the same size as coordinates.

        #. New coordinates are calculated as weighted average of collected
           coordinates with :func:`numpy.average`. As weights collected lengths are used.

           .. note::

               Function :func:`numpy.average` is called with flatten coordinates and lengths.

        #. Width of average path is calculated as mean value of flatten
           coordinates mutual distances. It is ``0.`` for a single spath or when
           the chunk holds fewer than two points.
        #. Type of average paths is calculated as probability (frequency) of
           :attr:`~aquaduct.traj.paths.GenericPathTypeCodes.scope_name`.

        :param tuple sp_slices_: Slices that cut chunks from all paths.
        :rtype: 3 element tuple
        :return: coordinates, type (frequency), and width of averaged spaths in current point
        """
        # zz means zip_zip: chunk of coordinates for each spath
        coords_zz = [sp.get_coords_cont(smooth=self.smooth)[sl]
                     for sp, sl in zip(self.spaths, sp_slices_)]

        # weights: length of each spath spread evenly over points of its chunk
        lens_zz = []
        for length, coord_z in zip(self.lens_cache, coords_zz):
            chunk_size = len(coord_z)
            if chunk_size > 0:
                lens_zz.append([float(length) / chunk_size] * chunk_size)
            else:
                lens_zz.append([])  # empty chunk contributes no weights

        # flatten chunks; intermediate containers are dropped early to save memory
        coords_zz_cat = list(concatenate(*coords_zz))
        del coords_zz
        lens_zz_cat = list(concatenate(*lens_zz))
        del lens_zz

        # weighted average of coordinates
        coords_to_append = make_default_array(np.average(coords_zz_cat, axis=0, weights=lens_zz_cat))
        del lens_zz_cat

        # width of the bunch of paths in this point
        if len(self.spaths) > 1:
            # TODO: pdist is probably the reason for memory hunger
            widths_to_append = make_default_array(self._mean_pairwise_distance(coords_zz_cat))
        else:
            widths_to_append = 0.
        del coords_zz_cat

        # frequency of scope type among all types in current chunks
        types_zz_cat = list(concatenate(*[sp.gtypes_cont[sl] for sp, sl in zip(self.spaths, sp_slices_)]))
        types_to_append = float(types_zz_cat.count(GenericPathTypeCodes.scope_name)) / len(types_zz_cat)

        return coords_to_append, types_to_append, widths_to_append

    def __call__(self, nr_sp_slices_):
        """
        Callable interface.

        :param tuple nr_sp_slices_: Two element tuple: nr and sp_slice
        :rtype: 2 element tuple
        :return: nr as submitted and result of :meth:`coords_types_prob_widths` for sp_slice.
        """
        return nr_sp_slices_[0], self.coords_types_prob_widths(nr_sp_slices_[-1])
class CTypeSpathsCollection(object):
    """
    Object for grouping separate paths that belong to the same CType.

    Method :meth:`get_master_path` allows for calculation of average path.
    """

    # Enumeration of spath parts.
    parts = (0, 1, 2)

    def __init__(self, spaths=None, ctype=None, bias_long=5, pbar=None, threads=1):
        """
        :param list spaths: List of separate paths.
        :param InletClusterGenericType ctype: CType of spaths.
        :param int bias_long: Bias towards long paths used in :meth:`lens_norm`.
        :param pbar: Progress bar object.
        :param int threads: Number of available threads.
        """
        self.pbar = pbar
        self.threads = threads
        logger.debug("Threads passed %d", threads)

        self.spaths = spaths
        assert isinstance(ctype, (InletClusterGenericType, InletClusterExtendedType))
        self.ctype = ctype
        self.bias_long = bias_long

        # precompute some values
        self.beat()
        with clui.tictoc('spaths props cache in %s' % str(self.ctype)):
            self.lens_cache = self.lens()
            self.lens_real_cache = self.lens_real()
            self.lens_norm_cache = self.lens_norm()
            self.full_size_cache = self.full_size()
        self.beat()

        self.manager = multiprocessing.Manager()
        self.lock = self.manager.Lock()

    def beat(self):
        """
        Touch progress bar, if any.
        """
        if self.pbar is not None:
            self.pbar.heartbeat()

    def update(self):
        """
        Update progress bar by one, if any.
        """
        if self.pbar is not None:
            self.pbar.next()

    def lens(self):
        """
        Returns total lengths of all paths.

        If ctype is of #:# form, i.e. its input is not `None`, is greater than 0
        and equals its output, then length of `object` part only is taken
        (full size in case of :class:`~aquaduct.traj.paths.PassingPath`).

        :return: Total (or `object` part) lengths of all paths.
        :rtype: numpy.ndarray
        """
        ctype_in = self.ctype.input
        if ctype_in is not None and ctype_in > 0 and ctype_in == self.ctype.output:
            return make_default_array(
                [float(sp.size) if isinstance(sp, PassingPath) else float(len(sp.types_object))
                 for sp in self.spaths])
        return make_default_array([float(sp.size) for sp in self.spaths])

    def lens_norm(self):
        """
        Returns normalized lengths calculated by :meth:`lens`.

        Applied normalization is twofold:

        #. All lengths are divided by maximal length, and
        #. All lengths are subjected to :func:`pow` function with p = :attr:`bias_long`.

        :return: Normalized total (or `object` part) lengths of all paths.
        :rtype: numpy.ndarray
        """
        lens = self.lens()
        longest = np.max(lens)
        if longest > 0:
            lens /= longest  # normalize
        return lens ** self.bias_long  # bias to long paths

    def lens_real(self):
        """
        Returns real lengths of all paths.

        :return: Sizes of all paths.
        :rtype: list
        """
        return [sp.size for sp in self.spaths]

    def full_size(self):
        """
        Returns desired size of master path.

        For each part, sizes of that part in all spaths are averaged with
        weights equal to normalized sizes; the result is one third of the sum
        over parts, but not less than 30.

        :return: Size of master path.
        :rtype: int
        """
        sizes = []
        for part in self.parts:
            # lengths of part of all paths, normalized by the longest one
            lens = make_default_array([float(sp.sizes[part]) for sp in self.spaths])
            longest = np.max(lens)
            if longest > 0:
                lens /= longest
            if sum(lens) == 0:
                sizes.append(0)
            else:
                # weighted average by paths lengths
                sizes.append(int(np.average([len(sp.types[part]) for sp in self.spaths], 0, lens)))
        # total size (desired), min 30 - a good low limit default?
        desired = max(30, sum(sizes) / 3)
        logger.debug("Full size is %d.", desired)
        return int(desired)

    @staticmethod
    def simple_types_distribution(types):
        """
        Calculates normalized sizes of incoming, object, and outgoing parts of spath using generic types.

        It is assumed that spath has object part.

        :param list types: List of generic types.
        :rtype: 3 element list
        :return: Normalized sizes of incoming, object, and outgoing parts.
        """
        # possible types are:
        # GenericPathTypeCodes.object_name
        # GenericPathTypeCodes.scope_name
        td_in, td_out = 0, 0
        sls = list(list_blocks_to_slices(types))
        first_block = types[sls[0]]
        last_block = types[sls[-1]]
        if GenericPathTypeCodes.scope_name in first_block:
            td_in = len(first_block)  # this is input part
        if GenericPathTypeCodes.scope_name in last_block:
            td_out = len(last_block)  # this is output part
        # NOTE(review): if types consist of a single scope block it is counted both
        # as input and as output and the object size becomes negative; left as is
        # because the docstring declares that an object part is assumed.
        td_obj = len(types) - td_in - td_out  # the rest is object
        return [float(x) / len(types) for x in (td_in, td_obj, td_out)]

    def types_distribution(self):
        """
        :rtype: numpy.matrix
        :return: median values of :meth:`simple_types_distribution` for all spaths.
        """
        # make median distributions
        return np.matrix(make_default_array(
            np.median([self.simple_types_distribution(sp.gtypes_cont) for sp in self.spaths], axis=0)))

    def types_prob_to_types(self, types_prob):
        """
        Changes types probabilities as returned by :meth:`CTypeSpathsCollectionWorker.coords_types_prob_widths`
        to types.

        Each distinct probability is tried as a threshold (values greater or
        equal become scope, others become object). The threshold whose
        resulting :meth:`simple_types_distribution` is closest to
        :meth:`types_distribution` is used.

        :param list types_prob: List of types probabilities.
        :rtype: list
        :return: List of :class:`~aquaduct.traj.paths.GenericPathTypeCodes`.
        """
        scope = GenericPathTypeCodes.scope_name
        obj = GenericPathTypeCodes.object_name

        def apply_threshold(threshold):
            return [scope if prob >= threshold else obj for prob in types_prob]

        types_dist_orig = self.types_distribution()
        candidates = list(set(types_prob))
        distances = []
        for threshold in candidates:
            dist = np.matrix(self.simple_types_distribution(apply_threshold(threshold)))
            distances.append(make_default_array(cdist(dist, types_dist_orig, metric='euclidean')))
            self.beat()
        if not candidates:
            return []  # nothing to convert
        # threshold is selected once, not for every element of types_prob
        return apply_threshold(candidates[np.argmin(distances)])

    def get_master_path(self, smooth=None, resid=(0, 0)):
        """
        .. _master_path_generation:

        Averages spaths into one master path.

        This is done in steps:

        #. Master path is an average of bunch of spaths. Its length is determined
           by :meth:`full_size` method.
        #. All spaths are then divided in to chunks according to
           :func:`~aquaduct.utils.helpers.xzip_xzip` function with :attr:`N` set to
           length of master path. This results in list of length equal to the
           length of master path. Elements of this list are slice objects that
           can be used to slice spaths in appropriate chunks.
        #. Next, for each element of this list
           :meth:`CTypeSpathsCollectionWorker.coords_types_prob_widths` method is
           called. Types probabilities are changed to types with
           :meth:`types_prob_to_types`.
        #. Finally, all data are used to create appropriate :class:`MasterPath`.
           If this fails `None` is returned.

        If more than one thread is available, chunks are processed by
        :class:`multiprocessing.Pool`; the pool is always shut down, also when
        calculations fail.

        :param Smooth smooth: Smoothing method.
        :param int resid: Residue ID of master path.
        :rtype: :class:`~aquaduct.traj.paths.MasterPath`
        :return: Average path as :class:`~aquaduct.traj.paths.MasterPath` object
            or `None` if creation of master path failed.
        """
        # prepare worker and add some spaths precalculated properties to it
        worker = CTypeSpathsCollectionWorker(spaths=self.spaths, ctype=self.ctype, bias_long=self.bias_long,
                                             smooth=smooth, lock=self.lock)
        worker.lens_cache = self.lens_cache
        worker.lens_real_cache = self.lens_real_cache
        worker.lens_norm_cache = self.lens_norm_cache
        worker.full_size_cache = self.full_size_cache

        # desired full size of path
        full_size = self.full_size_cache

        # containers for coords, types and widths of master path
        coords = [None] * full_size
        types = [None] * full_size
        widths = [None] * full_size

        # pbar magic
        pbar_previous = 0
        pbar_factor = float(len(self.spaths)) / full_size

        # mapping function - builtin map or ordered map of pool of workers
        pool = None
        map_fun = map
        try:
            if self.threads > 1:
                pool = multiprocessing.Pool(self.threads)
                chunk_size = max(1, int(full_size / self.threads ** 2))
                map_fun = partial(pool.imap, chunksize=chunk_size)

            # maximal number of spath
            spath_nr_max = 0

            # 1. Sizes of spaths (lens_real) are submitted to xzip_xzip with N=full_size.
            #    For each spath there will be collection of N slices, each slice
            #    cuts some part of spath. In consequence, all spaths are cut in
            #    to N chunks and for each path chunk can be of different size.
            # 2. These slices are submitted to worker callable class.
            for pbar_nr, (spath_nr, (coords_, types_, widths_)) in enumerate(
                    map_fun(worker, enumerate(xzip_xzip(*worker.lens_real_cache, N=full_size)))):
                coords[spath_nr] = coords_
                types[spath_nr] = types_
                widths[spath_nr] = widths_
                spath_nr_max = max(spath_nr, spath_nr_max)
                pbar_current = int((pbar_nr + 1) * pbar_factor)
                if pbar_current > pbar_previous:
                    pbar_previous = pbar_current
                    self.update()  # update progress bar
                else:
                    self.beat()

            assert pbar_nr == spath_nr_max, "Internal error. Final global progress of master path generation not synced with maximal number of spath. Please send a bug report to developer(s): %s" % clui.mail

            if pool is not None:
                pool.close()
                pool.join()
        finally:
            # release worker processes also when calculations above failed
            if pool is not None:
                pool.terminate()

        # at this stage we have coords, widths and types probability

        # get proper types
        with clui.tictoc('proper tests in %s' % str(self.ctype)):
            types = self.types_prob_to_types(types)

        # make frames
        frames = range(len(coords))

        # finalize: max and min possible frames
        min_pf = 0
        max_pf = len(coords) - 1
        if self.ctype is None:
            # this never happens because of assertion in __init__
            min_pf = None
            max_pf = None
        else:
            if self.ctype.input is not None:
                min_pf = None
            if self.ctype.output is not None:
                max_pf = None

        with clui.tictoc('generic paths in %s' % str(self.ctype)):
            # get and populate GenericPath
            fsrs_cache.update({resid: FakeSingleResidueSelection(resid, frames, coords)})
            gp = GenericPaths(resid, min_pf=min_pf, max_pf=max_pf)
            for t, f in zip(types, frames):  # TODO: remove loop
                gp.add_type(f, t)

        # now try to get first SinglePath, if unable issue WARNING
        with clui.tictoc('separate paths in %s' % str(self.ctype)):
            try:
                sp = list(yield_single_paths([gp], passing=False))[0]
            except IndexError:
                logger.warning('No master path found for ctype %s' % str(self.ctype))
                return None

        # finally get MasterPath and add widths
        mp = MasterPath(sp, single_res_selection=fsrs_cache[resid])
        mp.add_width(widths)
        return mp
# fake single residue type like object

from aquaduct.traj.sandwich import SingleResidueSelection
from aquaduct.utils.helpers import arrayify


class FakeSingleResidueSelection(SingleResidueSelection):
    """
    Single residue selection like object that serves coordinates submitted
    at creation time.
    """

    def __init__(self, resid, frames, coords):
        """
        :param resid: Residue ID passed to the parent class.
        :param frames: Frames of submitted coordinates.
        :param coords: Coordinates; they are indexed by frame.
        """
        super().__init__(resid)
        self._frames, self._coords = frames, coords

    @arrayify(shape=(None, 3))
    def coords(self, frames):
        """
        Yields stored coordinates for requested frames.

        It is assumed that requested frames are available in stored coordinates.
        """
        yield from (self._coords[frame] for frame in frames)

    # TODO: This part of the code is weak. Change it, here and as well as in sandwich.
    def coords_smooth(self, sranges, smooth):
        """
        Yields, for each of submitted ranges, coordinates of that range
        passed through `smooth` callable.
        """
        for single_range in sranges:
            raw = self.coords(single_range.get())
            yield smooth(raw)

    def get_edges(self):
        """
        Always returns `None`.
        """
        return None