#include <assert.h>#include <float.h>#include "mbed.h"#include "muscle_tree.h"#include "seq.h"#include "symmatrix.h"#include "ktuple_pair.h"#include "tree.h"#include "util.h"#include "progress.h"#include "list.h"#include "log.h"#include "kmpp/KMeans.h"#include "mbed.h"| Data Structures | |
| struct | bisecting_kmeans_result_t | 
| Defines | |
| #define | TIMING 0 | 
| #define | FULL_WITHIN_CLUSTER_DISTANCES 1 | 
| #define | COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0 | 
| #define | USE_KMEANS_LLOYDS 0 | 
| #define | log2(x) (log(x) / 0.69314718055994530942) | 
| #define | NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2) | 
| #define | SEED_SELECTION SELECT_SEEDS_BY_LENGTH | 
| #define | USE_EUCLIDEAN_DISTANCE 1 | 
| #define | PRINT_CLUSTER_DISTRIBUTION 0 | 
| #define | TRACE 0 | 
| Enumerations | |
| enum | SEED_SELECTION_TYPE { SELECT_SEEDS_RANDOMLY, SELECT_SEEDS_BY_LENGTH } | 
| Functions | |
| void | FreeKMeansResult (bisecting_kmeans_result_t **prResult_p) | 
| Free KMeans result structure. | |
| void | NewKMeansResult (bisecting_kmeans_result_t **prKMeansResult_p) | 
| Allocate new KMeans result. | |
| double | EuclDist (const double *v1, const double *v2, const int dim) | 
| Calculate the euclidean distance between two vectors. | |
| double | CosDist (const double *v1, const double *v2, const int dim) | 
| Calculate the cosine distance between two vectors. | |
| int | SeqToVec (double **ppdSeqVec, mseq_t *prMSeq, int *piSeeds, const int iNumSeeds, const int iPairDistType) | 
| convert sequences into mbed-like (distance) vector representation. Seeds (prMSeq sequence indices) have to be picked before | |
| int | SeedSelection (int *piSeeds, int iNumSeeds, int iSelectionMethod, mseq_t *prMSeq) | 
| Select seeds to be used from an prMSeq. | |
| void | BisectingKmeans (bisecting_kmeans_result_t **prKMeansResult_p, const int iNObjs, const int iDim, double **ppdVectors, const int iMinRequiredObjsPerCluster, const int iMaxAllowedObjsPerCluster) | 
| Bisecting K-Means clustering. Repeatedly calls K-Means with a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster. | |
| int | Mbed (tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, const char *pcGuidetreeOut) | 
| From scratch reimplementation of mBed: Blackshields et al. (2010); PMID 20470396. | |
| #define COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0 | 
| #define FULL_WITHIN_CLUSTER_DISTANCES 1 | 
| #define log2 | ( | x | ) | (log(x) / 0.69314718055994530942) | 
| #define NUMBER_OF_SEEDS | ( | n | ) | pow(log2(((double)n)), 2) | 
| #define PRINT_CLUSTER_DISTRIBUTION 0 | 
| #define SEED_SELECTION SELECT_SEEDS_BY_LENGTH | 
| #define TIMING 0 | 
| #define TRACE 0 | 
| #define USE_EUCLIDEAN_DISTANCE 1 | 
| #define USE_KMEANS_LLOYDS 0 | 
| enum SEED_SELECTION_TYPE | 
| void BisectingKmeans | ( | bisecting_kmeans_result_t ** | prKMeansResult_p, | |
| const int | iNObjs, | |||
| const int | iDim, | |||
| double ** | ppdVectors, | |||
| const int | iMinRequiredObjsPerCluster, | |||
| const int | iMaxAllowedObjsPerCluster | |||
| ) | 
Bisecting K-Means clustering. Repeatedly calls K-Means with a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster.
| [out] | prKMeansResult_p | Result of Bisecting KMeans. Will be allocated here. Caller has to free. See | 
| [in] | iNObjs | Number of objects/sequences to cluster | 
| [in] | iDim | Dimensionality of input data | 
| [in] | ppdVectors | each row holds iDim points for this object's coordinates | 
| [in] | iMinRequiredObjsPerCluster | Minimum number of objects per Cluster (inclusive)/ | 
| [in] | iMaxAllowedObjsPerCluster | Maximum number of objects per Cluster (inclusive). Function returns once no cluster contains more then this number of objects. Soft limit! | 
| double CosDist | ( | const double * | v1, | |
| const double * | v2, | |||
| const int | dim | |||
| ) | 
Calculate the cosine distance between two vectors.
| [in] | v1 | First vector with dim dimensions | 
| [in] | v2 | Second vector with dim dimensions | 
| [in] | dim | Dimensionality of v1 and v2 | 
| double EuclDist | ( | const double * | v1, | |
| const double * | v2, | |||
| const int | dim | |||
| ) | 
Calculate the euclidean distance between two vectors.
| [in] | v1 | First vector with dim dimensions | 
| [in] | v2 | Second vector with dim dimensions | 
| [in] | dim | Dimensionality of v1 and v2 | 
| void FreeKMeansResult | ( | bisecting_kmeans_result_t ** | prResult_p | ) | 
Free KMeans result structure.
| [out] | prResult_p | K-Means result to free | 
| int Mbed | ( | tree_t ** | prMbedTree_p, | |
| mseq_t * | prMSeq, | |||
| const int | iPairDistType, | |||
| const char * | pcGuidetreeOut | |||
| ) | 
From scratch reimplementation of mBed: Blackshields et al. (2010); PMID 20470396.
Idea is a follows:
| [out] | prMbedTree_p | Created upgma tree. will be allocated here. use FreeMuscleTree() to free | 
| [in] | prMSeq | Multiple sequences | 
| [in] | iPairDistType | Distance measure for pairwise alignments | 
| [in] | pcGuidetreeOut | Passed down to GuideTreeUpgma() | 
| void NewKMeansResult | ( | bisecting_kmeans_result_t ** | prKMeansResult_p | ) | 
Allocate new KMeans result.
| [out] | prKMeansResult_p | K-Means result to free | 
| int SeedSelection | ( | int * | piSeeds, | |
| int | iNumSeeds, | |||
| int | iSelectionMethod, | |||
| mseq_t * | prMSeq | |||
| ) | 
Select seeds to be used from an prMSeq.
| [out] | piSeeds | Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated. | 
| [in] | iNumSeeds | Number of seeds to be picked | 
| [in] | iSelectionMethod | Seed selection method to be used | 
| [in] | prMSeq | The prMSeq structure to pick sequences from | 
| int SeqToVec | ( | double ** | ppdSeqVec, | |
| mseq_t * | prMSeq, | |||
| int * | piSeeds, | |||
| const int | iNumSeeds, | |||
| const int | iPairDistType | |||
| ) | 
convert sequences into mbed-like (distance) vector representation. Seeds (prMSeq sequence indices) have to be picked before
| [out] | ppdSeqVec | Pointer to preallocated matrix of size nseqs x iSeeds | 
| [in] | prMSeq | Sequences which are to be converted | 
| [in] | piSeeds | Array of sequences in prMSeq which are to be used as seeds. | 
| [in] | iNumSeeds | Number of seeds/elements in piSeeds | 
| [in] | iPairDistType | Type of pairwise distance comparison | 
 1.6.3
 1.6.3