#include <assert.h>#include <float.h>#include "mbed.h"#include "muscle_tree.h"#include "seq.h"#include "symmatrix.h"#include "ktuple_pair.h"#include "tree.h"#include "util.h"#include "progress.h"#include "list.h"#include "log.h"#include "kmpp/KMeans.h"#include "mbed.h"Data Structures | |
| struct | bisecting_kmeans_result_t |
Defines | |
| #define | TIMING 0 |
| #define | FULL_WITHIN_CLUSTER_DISTANCES 1 |
| #define | COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0 |
| #define | USE_KMEANS_LLOYDS 0 |
| #define | log2(x) (log(x) / 0.69314718055994530942) |
| #define | NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2) |
| #define | SEED_SELECTION SELECT_SEEDS_BY_LENGTH |
| #define | USE_EUCLIDEAN_DISTANCE 1 |
| #define | PRINT_CLUSTER_DISTRIBUTION 0 |
| #define | TRACE 0 |
Enumerations | |
| enum | SEED_SELECTION_TYPE { SELECT_SEEDS_RANDOMLY, SELECT_SEEDS_BY_LENGTH } |
Functions | |
| void | FreeKMeansResult (bisecting_kmeans_result_t **prResult_p) |
| Free KMeans result structure. | |
| void | NewKMeansResult (bisecting_kmeans_result_t **prKMeansResult_p) |
| Allocate new KMeans result. | |
| double | EuclDist (const double *v1, const double *v2, const int dim) |
| Calculate the euclidean distance between two vectors. | |
| double | CosDist (const double *v1, const double *v2, const int dim) |
| Calculate the cosine distance between two vectors. | |
| int | SeqToVec (double **ppdSeqVec, mseq_t *prMSeq, int *piSeeds, const int iNumSeeds, const int iPairDistType) |
| convert sequences into mbed-like (distance) vector representation. Seeds (prMSeq sequence indices) have to be picked before | |
| int | SeedSelection (int *piSeeds, int iNumSeeds, int iSelectionMethod, mseq_t *prMSeq) |
| Select seeds to be used from an prMSeq. | |
| void | BisectingKmeans (bisecting_kmeans_result_t **prKMeansResult_p, const int iNObjs, const int iDim, double **ppdVectors, const int iMinRequiredObjsPerCluster, const int iMaxAllowedObjsPerCluster) |
| Bisecting K-Means clustering. Repeatedly calls K-Means with a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster. | |
| int | Mbed (tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, const char *pcGuidetreeOut) |
| From scratch reimplementation of mBed: Blackshields et al. (2010); PMID 20470396. | |
| #define COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0 |
| #define FULL_WITHIN_CLUSTER_DISTANCES 1 |
| #define log2 | ( | x | ) | (log(x) / 0.69314718055994530942) |
| #define NUMBER_OF_SEEDS | ( | n | ) | pow(log2(((double)n)), 2) |
| #define PRINT_CLUSTER_DISTRIBUTION 0 |
| #define SEED_SELECTION SELECT_SEEDS_BY_LENGTH |
| #define TIMING 0 |
| #define TRACE 0 |
| #define USE_EUCLIDEAN_DISTANCE 1 |
| #define USE_KMEANS_LLOYDS 0 |
| enum SEED_SELECTION_TYPE |
| void BisectingKmeans | ( | bisecting_kmeans_result_t ** | prKMeansResult_p, | |
| const int | iNObjs, | |||
| const int | iDim, | |||
| double ** | ppdVectors, | |||
| const int | iMinRequiredObjsPerCluster, | |||
| const int | iMaxAllowedObjsPerCluster | |||
| ) |
Bisecting K-Means clustering. Repeatedly calls K-Means with a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster.
| [out] | prKMeansResult_p | Result of Bisecting KMeans. Will be allocated here. Caller has to free. See |
| [in] | iNObjs | Number of objects/sequences to cluster |
| [in] | iDim | Dimensionality of input data |
| [in] | ppdVectors | each row holds iDim points for this object's coordinates |
| [in] | iMinRequiredObjsPerCluster | Minimum number of objects per Cluster (inclusive)/ |
| [in] | iMaxAllowedObjsPerCluster | Maximum number of objects per Cluster (inclusive). Function returns once no cluster contains more then this number of objects. Soft limit! |
| double CosDist | ( | const double * | v1, | |
| const double * | v2, | |||
| const int | dim | |||
| ) |
Calculate the cosine distance between two vectors.
| [in] | v1 | First vector with dim dimensions |
| [in] | v2 | Second vector with dim dimensions |
| [in] | dim | Dimensionality of v1 and v2 |
| double EuclDist | ( | const double * | v1, | |
| const double * | v2, | |||
| const int | dim | |||
| ) |
Calculate the euclidean distance between two vectors.
| [in] | v1 | First vector with dim dimensions |
| [in] | v2 | Second vector with dim dimensions |
| [in] | dim | Dimensionality of v1 and v2 |
| void FreeKMeansResult | ( | bisecting_kmeans_result_t ** | prResult_p | ) |
Free KMeans result structure.
| [out] | prResult_p | K-Means result to free |
| int Mbed | ( | tree_t ** | prMbedTree_p, | |
| mseq_t * | prMSeq, | |||
| const int | iPairDistType, | |||
| const char * | pcGuidetreeOut | |||
| ) |
From scratch reimplementation of mBed: Blackshields et al. (2010); PMID 20470396.
Idea is a follows:
| [out] | prMbedTree_p | Created upgma tree. will be allocated here. use FreeMuscleTree() to free |
| [in] | prMSeq | Multiple sequences |
| [in] | iPairDistType | Distance measure for pairwise alignments |
| [in] | pcGuidetreeOut | Passed down to GuideTreeUpgma() |
| void NewKMeansResult | ( | bisecting_kmeans_result_t ** | prKMeansResult_p | ) |
Allocate new KMeans result.
| [out] | prKMeansResult_p | K-Means result to free |
| int SeedSelection | ( | int * | piSeeds, | |
| int | iNumSeeds, | |||
| int | iSelectionMethod, | |||
| mseq_t * | prMSeq | |||
| ) |
Select seeds to be used from an prMSeq.
| [out] | piSeeds | Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated. |
| [in] | iNumSeeds | Number of seeds to be picked |
| [in] | iSelectionMethod | Seed selection method to be used |
| [in] | prMSeq | The prMSeq structure to pick sequences from |
| int SeqToVec | ( | double ** | ppdSeqVec, | |
| mseq_t * | prMSeq, | |||
| int * | piSeeds, | |||
| const int | iNumSeeds, | |||
| const int | iPairDistType | |||
| ) |
convert sequences into mbed-like (distance) vector representation. Seeds (prMSeq sequence indices) have to be picked before
| [out] | ppdSeqVec | Pointer to preallocated matrix of size nseqs x iSeeds |
| [in] | prMSeq | Sequences which are to be converted |
| [in] | piSeeds | Array of sequences in prMSeq which are to be used as seeds. |
| [in] | iNumSeeds | Number of seeds/elements in piSeeds |
| [in] | iPairDistType | Type of pairwise distance comparison |
1.6.3