21#include <unordered_map>
25constexpr uint KMEANS_MAX_ITERATIONS = 1000;
27QString QgsKMeansClusteringAlgorithm::name()
const
29 return QStringLiteral(
"kmeansclustering" );
32QString QgsKMeansClusteringAlgorithm::displayName()
const
34 return QObject::tr(
"K-means clustering" );
37QStringList QgsKMeansClusteringAlgorithm::tags()
const
39 return QObject::tr(
"clustering,clusters,kmeans,points" ).split(
',' );
42QString QgsKMeansClusteringAlgorithm::group()
const
44 return QObject::tr(
"Vector analysis" );
47QString QgsKMeansClusteringAlgorithm::groupId()
const
49 return QStringLiteral(
"vectoranalysis" );
52void QgsKMeansClusteringAlgorithm::initAlgorithm(
const QVariantMap & )
57 QStringList initializationMethods;
58 initializationMethods << QObject::tr(
"Farthest points" )
59 << QObject::tr(
"K-means++" );
60 addParameter(
new QgsProcessingParameterEnum( QStringLiteral(
"METHOD" ), QObject::tr(
"Method" ), initializationMethods,
false, 0,
false ) );
62 auto fieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral(
"FIELD_NAME" ), QObject::tr(
"Cluster field name" ), QStringLiteral(
"CLUSTER_ID" ) );
64 addParameter( fieldNameParam.release() );
65 auto sizeFieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral(
"SIZE_FIELD_NAME" ), QObject::tr(
"Cluster size field name" ), QStringLiteral(
"CLUSTER_SIZE" ) );
67 addParameter( sizeFieldNameParam.release() );
72QString QgsKMeansClusteringAlgorithm::shortHelpString()
const
74 return QObject::tr(
"This algorithm calculates the 2D distance based k-means cluster number for each input feature.\n\n"
75 "If input geometries are lines or polygons, the clustering is based on the centroid of the feature.\n\n"
77 "Arthur, David & Vassilvitskii, Sergei. (2007). K-Means++: The Advantages of Careful Seeding. Proc. of the Annu. ACM-SIAM Symp. on Discrete Algorithms. 8.\n\n"
78 "Bhattacharya, Anup & Eube, Jan & Röglin, Heiko & Schmidt, Melanie. (2019). Noisy, Greedy and Not So Greedy k-means++" );
81QString QgsKMeansClusteringAlgorithm::shortDescription()
const
83 return QObject::tr(
"Calculates the 2D distance based k-means cluster number for each input feature." );
86QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance()
const
88 return new QgsKMeansClusteringAlgorithm();
93 std::unique_ptr<QgsProcessingFeatureSource> source( parameterAsSource( parameters, QStringLiteral(
"INPUT" ), context ) );
97 int k = parameterAsInt( parameters, QStringLiteral(
"CLUSTERS" ), context );
98 int initializationMethod = parameterAsInt( parameters, QStringLiteral(
"METHOD" ), context );
100 QgsFields outputFields = source->fields();
102 const QString clusterFieldName = parameterAsString( parameters, QStringLiteral(
"FIELD_NAME" ), context );
103 newFields.
append(
QgsField( clusterFieldName, QMetaType::Type::Int ) );
104 const QString clusterSizeFieldName = parameterAsString( parameters, QStringLiteral(
"SIZE_FIELD_NAME" ), context );
105 newFields.
append(
QgsField( clusterSizeFieldName, QMetaType::Type::Int ) );
109 std::unique_ptr<QgsFeatureSink> sink( parameterAsSink( parameters, QStringLiteral(
"OUTPUT" ), context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
114 feedback->
pushInfo( QObject::tr(
"Collecting input points" ) );
115 const double step = source->featureCount() > 0 ? 50.0 /
static_cast< double >( source->featureCount() ) : 1;
118 int featureWithGeometryCount = 0;
121 std::vector<Feature> clusterFeatures;
123 QHash<QgsFeatureId, std::size_t> idToObj;
135 featureWithGeometryCount++;
151 idToObj[feat.
id()] = clusterFeatures.size();
152 clusterFeatures.emplace_back( Feature( point ) );
157 feedback->
reportError( QObject::tr(
"Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
163 feedback->
pushInfo( QObject::tr(
"Calculating clusters" ) );
166 std::vector<QgsPointXY> centers( k );
167 switch ( initializationMethod )
170 initClustersFarthestPoints( clusterFeatures, centers, k, feedback );
173 initClustersPlusPlus( clusterFeatures, centers, k, feedback );
178 calculateKMeans( clusterFeatures, centers, k, feedback );
182 std::unordered_map<int, int> clusterSize;
183 for (
auto it = idToObj.constBegin(); it != idToObj.constEnd(); ++it )
185 clusterSize[clusterFeatures[it.value()].cluster]++;
188 features = source->getFeatures();
200 const auto obj = idToObj.find( feat.
id() );
203 attr << QVariant() << QVariant();
207 attr << 0 << featureWithGeometryCount;
211 const int cluster = clusterFeatures[*obj].cluster;
212 attr << cluster << clusterSize[cluster];
222 outputs.insert( QStringLiteral(
"OUTPUT" ), dest );
228void QgsKMeansClusteringAlgorithm::initClustersFarthestPoints( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers,
const int k,
QgsProcessingFeedback *feedback )
230 const std::size_t n = points.size();
236 for (
int i = 0; i < k; i++ )
237 centers[i] = points[0].point;
241 std::size_t duplicateCount = 1;
245 double distanceP1 = 0;
246 double distanceP2 = 0;
247 double maxDistance = -1;
248 for ( std::size_t i = 1; i < n; i++ )
250 distanceP1 = points[i].point.sqrDist( points[p1].point );
251 distanceP2 = points[i].point.sqrDist( points[p2].point );
254 if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
256 maxDistance = std::max( distanceP1, distanceP2 );
257 if ( distanceP1 > distanceP2 )
268 if ( feedback && duplicateCount > 1 )
270 feedback->
pushWarning( QObject::tr(
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested",
nullptr,
static_cast< int >( duplicateCount ) ) );
277 centers[0] = points[p1].point;
278 centers[1] = points[p2].point;
283 std::vector<double> distances( n );
286 for ( std::size_t j = 0; j < n; j++ )
288 distances[j] = points[j].point.sqrDist( centers[0] );
294 for (
int i = 2; i < k; i++ )
296 std::size_t candidateCenter = 0;
297 double maxDistance = std::numeric_limits<double>::lowest();
300 for ( std::size_t j = 0; j < n; j++ )
303 if ( distances[j] < 0 )
307 distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
310 if ( distances[j] > maxDistance )
313 maxDistance = distances[j];
318 Q_ASSERT( maxDistance >= 0 );
321 distances[candidateCenter] = -1;
323 centers[i] = points[candidateCenter].point;
328void QgsKMeansClusteringAlgorithm::initClustersPlusPlus( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers,
const int k,
QgsProcessingFeedback *feedback )
330 const std::size_t n = points.size();
336 for (
int i = 0; i < k; i++ )
337 centers[i] = points[0].point;
342 std::random_device rd;
343 std::mt19937 gen( rd() );
344 std::uniform_int_distribution<size_t> distrib( 0, n - 1 );
346 std::size_t p1 = distrib( gen );
347 centers[0] = points[p1].point;
350 std::vector<double> distances( n );
351 double totalError = 0;
352 std::size_t duplicateCount = 1;
353 for (
size_t i = 0; i < n; i++ )
355 double distance = points[i].point.sqrDist( centers[0] );
356 distances[i] = distance;
357 totalError += distance;
363 if ( feedback && duplicateCount > 1 )
365 feedback->
pushWarning( QObject::tr(
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested",
nullptr,
static_cast< int >( duplicateCount ) ) );
373 unsigned int numCandidateCenters = 2 +
static_cast< int >( std::floor( std::log( k ) ) );
374 std::vector<double> randomNumbers( numCandidateCenters );
375 std::vector<size_t> candidateCenters( numCandidateCenters );
377 std::uniform_real_distribution<double> dis( 0.0, 1.0 );
378 for (
int i = 1; i < k; i++ )
381 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
383 randomNumbers[j] = dis( gen ) * totalError;
387 std::vector<double> cumSum = distances;
388 for (
size_t j = 1; j < n; j++ )
390 cumSum[j] += cumSum[j - 1];
394 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
399 while ( low <= high )
401 size_t mid = low + ( high - low ) / 2;
402 if ( cumSum[mid] < randomNumbers[j] )
420 candidateCenters[j] = low;
423 std::vector<std::vector<double>> distancesCandidateCenters( numCandidateCenters, std::vector<double>( n ) );
427 double currentError = 0;
428 double lowestError = std::numeric_limits<double>::max();
429 unsigned int bestCandidateIndex = 0;
430 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
432 for (
size_t z = 0; z < n; z++ )
435 double distance = points[candidateCenters[j]].point.sqrDist( points[z].point );
437 if ( distance > distances[z] )
439 distance = distances[z];
441 distancesCandidateCenters[j][z] = distance;
442 currentError += distance;
444 if ( lowestError > currentError )
446 lowestError = currentError;
447 bestCandidateIndex = j;
452 for (
size_t j = 0; j < n; j++ )
454 distances[j] = distancesCandidateCenters[bestCandidateIndex][j];
457 centers[i] = points[candidateCenters[bestCandidateIndex]].point;
459 totalError = lowestError;
465void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> ¢ers,
int k,
QgsProcessingFeedback *feedback )
467 int converged =
false;
468 bool changed =
false;
471 std::vector<uint> weights( k );
474 for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
479 findNearest( objs, centers, k, changed );
480 updateMeans( objs, centers, weights, k );
481 converged = !changed;
484 if ( !converged && feedback )
485 feedback->
reportError( QObject::tr(
"Clustering did not converge after %n iteration(s)",
nullptr,
static_cast<int>( i ) ) );
487 feedback->
pushInfo( QObject::tr(
"Clustering converged after %n iteration(s)",
nullptr,
static_cast<int>( i ) ) );
492void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points,
const std::vector<QgsPointXY> ¢ers,
const int k,
bool &changed )
495 const std::size_t n = points.size();
496 for ( std::size_t i = 0; i < n; i++ )
498 Feature &point = points[i];
501 double currentDistance = point.point.sqrDist( centers[0] );
502 int currentCluster = 0;
505 for (
int cluster = 1; cluster < k; cluster++ )
507 const double distance = point.point.sqrDist( centers[cluster] );
508 if ( distance < currentDistance )
510 currentDistance = distance;
511 currentCluster = cluster;
516 if ( point.cluster != currentCluster )
519 point.cluster = currentCluster;
526void QgsKMeansClusteringAlgorithm::updateMeans(
const std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers, std::vector<uint> &weights,
const int k )
528 const uint n = points.size();
529 std::fill( weights.begin(), weights.end(), 0 );
530 for (
int i = 0; i < k; i++ )
532 centers[i].setX( 0.0 );
533 centers[i].setY( 0.0 );
535 for ( uint i = 0; i < n; i++ )
537 const int cluster = points[i].cluster;
538 centers[cluster] +=
QgsVector( points[i].point.x(), points[i].point.y() );
539 weights[cluster] += 1;
541 for (
int i = 0; i < k; i++ )
543 centers[i] /= weights[i];
@ VectorAnyGeometry
Any vector layer with geometry.
@ Advanced
Parameter is an advanced parameter which should be hidden from users by default.
Wrapper for iterator of features from vector data provider or vector layer.
bool nextFeature(QgsFeature &f)
Fetch next feature and stores in f, returns true on success.
Wraps a request for features to a vector layer (or directly its vector data provider).
@ FastInsert
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
The feature class encapsulates a single feature including its unique ID, geometry and a list of field...
void setAttributes(const QgsAttributes &attrs)
Sets the feature's attributes.
bool hasGeometry() const
Returns true if the feature has an associated geometry.
bool isCanceled() const
Tells whether the operation has been canceled already.
void setProgress(double progress)
Sets the current progress for the feedback object.
Encapsulate a field in an attribute table or data source.
Container of fields for a vector layer.
bool append(const QgsField &field, Qgis::FieldOrigin origin=Qgis::FieldOrigin::Provider, int originIndex=-1)
Appends a field.
A geometry is the spatial representation of a feature.
const QgsAbstractGeometry * constGet() const
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
QgsGeometry centroid() const
Returns the center of mass of a geometry.
Qgis::WkbType wkbType() const
Returns type of the geometry as a WKB type (point / linestring / polygon etc.).
Contains information about the context in which a processing algorithm is executed.
Custom exception class for processing related exceptions.
Base class for providing feedback from a processing algorithm.
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
virtual void pushWarning(const QString &warning)
Pushes a warning informational message from the algorithm.
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
An enum based parameter for processing algorithms, allowing for selection from predefined values.
A feature sink output for processing algorithms.
An input feature source (such as vector layers) parameter for processing algorithms.
A numeric parameter for processing algorithms.
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
Represent a 2-dimensional vector.
static Qgis::WkbType flatType(Qgis::WkbType type)
Returns the flat type for a WKB type.
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference).
T qgsgeometry_cast(QgsAbstractGeometry *geom)