QGIS API Documentation 3.99.0-Master (2fe06baccd8)
Loading...
Searching...
No Matches
qgsalgorithmkmeansclustering.cpp
Go to the documentation of this file.
1/***************************************************************************
2 qgsalgorithmkmeansclustering.cpp
3 ---------------------
4 begin : June 2018
5 copyright : (C) 2018 by Nyall Dawson
6 email : nyall dot dawson at gmail dot com
7 ***************************************************************************/
8
9/***************************************************************************
10 * *
11 * This program is free software; you can redistribute it and/or modify *
12 * it under the terms of the GNU General Public License as published by *
13 * the Free Software Foundation; either version 2 of the License, or *
14 * (at your option) any later version. *
15 * *
16 ***************************************************************************/
17
19
20#include <random>
21#include <unordered_map>
22
24
25constexpr uint KMEANS_MAX_ITERATIONS = 1000;
26
27QString QgsKMeansClusteringAlgorithm::name() const
28{
29 return QStringLiteral( "kmeansclustering" );
30}
31
32QString QgsKMeansClusteringAlgorithm::displayName() const
33{
34 return QObject::tr( "K-means clustering" );
35}
36
37QStringList QgsKMeansClusteringAlgorithm::tags() const
38{
39 return QObject::tr( "clustering,clusters,kmeans,points" ).split( ',' );
40}
41
42QString QgsKMeansClusteringAlgorithm::group() const
43{
44 return QObject::tr( "Vector analysis" );
45}
46
47QString QgsKMeansClusteringAlgorithm::groupId() const
48{
49 return QStringLiteral( "vectoranalysis" );
50}
51
52void QgsKMeansClusteringAlgorithm::initAlgorithm( const QVariantMap & )
53{
54 addParameter( new QgsProcessingParameterFeatureSource( QStringLiteral( "INPUT" ), QObject::tr( "Input layer" ), QList<int>() << static_cast<int>( Qgis::ProcessingSourceType::VectorAnyGeometry ) ) );
55 addParameter( new QgsProcessingParameterNumber( QStringLiteral( "CLUSTERS" ), QObject::tr( "Number of clusters" ), Qgis::ProcessingNumberParameterType::Integer, 5, false, 1 ) );
56
57 QStringList initializationMethods;
58 initializationMethods << QObject::tr( "Farthest points" )
59 << QObject::tr( "K-means++" );
60 addParameter( new QgsProcessingParameterEnum( QStringLiteral( "METHOD" ), QObject::tr( "Method" ), initializationMethods, false, 0, false ) );
61
62 auto fieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral( "FIELD_NAME" ), QObject::tr( "Cluster field name" ), QStringLiteral( "CLUSTER_ID" ) );
63 fieldNameParam->setFlags( fieldNameParam->flags() | Qgis::ProcessingParameterFlag::Advanced );
64 addParameter( fieldNameParam.release() );
65 auto sizeFieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral( "SIZE_FIELD_NAME" ), QObject::tr( "Cluster size field name" ), QStringLiteral( "CLUSTER_SIZE" ) );
66 sizeFieldNameParam->setFlags( sizeFieldNameParam->flags() | Qgis::ProcessingParameterFlag::Advanced );
67 addParameter( sizeFieldNameParam.release() );
68
69 addParameter( new QgsProcessingParameterFeatureSink( QStringLiteral( "OUTPUT" ), QObject::tr( "Clusters" ), Qgis::ProcessingSourceType::VectorAnyGeometry ) );
70}
71
72QString QgsKMeansClusteringAlgorithm::shortHelpString() const
73{
74 return QObject::tr( "This algorithm calculates the 2D distance based k-means cluster number for each input feature.\n\n"
75 "If input geometries are lines or polygons, the clustering is based on the centroid of the feature.\n\n"
76 "References:\n"
77 "Arthur, David & Vassilvitskii, Sergei. (2007). K-Means++: The Advantages of Careful Seeding. Proc. of the Annu. ACM-SIAM Symp. on Discrete Algorithms. 8.\n\n"
78 "Bhattacharya, Anup & Eube, Jan & Röglin, Heiko & Schmidt, Melanie. (2019). Noisy, Greedy and Not So Greedy k-means++" );
79}
80
81QString QgsKMeansClusteringAlgorithm::shortDescription() const
82{
83 return QObject::tr( "Calculates the 2D distance based k-means cluster number for each input feature." );
84}
85
86QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance() const
87{
88 return new QgsKMeansClusteringAlgorithm();
89}
90
91QVariantMap QgsKMeansClusteringAlgorithm::processAlgorithm( const QVariantMap &parameters, QgsProcessingContext &context, QgsProcessingFeedback *feedback )
92{
93 std::unique_ptr<QgsProcessingFeatureSource> source( parameterAsSource( parameters, QStringLiteral( "INPUT" ), context ) );
94 if ( !source )
95 throw QgsProcessingException( invalidSourceError( parameters, QStringLiteral( "INPUT" ) ) );
96
97 int k = parameterAsInt( parameters, QStringLiteral( "CLUSTERS" ), context );
98 int initializationMethod = parameterAsInt( parameters, QStringLiteral( "METHOD" ), context );
99
100 QgsFields outputFields = source->fields();
101 QgsFields newFields;
102 const QString clusterFieldName = parameterAsString( parameters, QStringLiteral( "FIELD_NAME" ), context );
103 newFields.append( QgsField( clusterFieldName, QMetaType::Type::Int ) );
104 const QString clusterSizeFieldName = parameterAsString( parameters, QStringLiteral( "SIZE_FIELD_NAME" ), context );
105 newFields.append( QgsField( clusterSizeFieldName, QMetaType::Type::Int ) );
106 outputFields = QgsProcessingUtils::combineFields( outputFields, newFields );
107
108 QString dest;
109 std::unique_ptr<QgsFeatureSink> sink( parameterAsSink( parameters, QStringLiteral( "OUTPUT" ), context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
110 if ( !sink )
111 throw QgsProcessingException( invalidSinkError( parameters, QStringLiteral( "OUTPUT" ) ) );
112
113 // build list of point inputs - if it's already a point, use that. If not, take the centroid.
114 feedback->pushInfo( QObject::tr( "Collecting input points" ) );
115 const double step = source->featureCount() > 0 ? 50.0 / static_cast< double >( source->featureCount() ) : 1;
116 int i = 0;
117 int n = 0;
118 int featureWithGeometryCount = 0;
119 QgsFeature feat;
120
121 std::vector<Feature> clusterFeatures;
122 QgsFeatureIterator features = source->getFeatures( QgsFeatureRequest().setNoAttributes() );
123 QHash<QgsFeatureId, std::size_t> idToObj;
124 while ( features.nextFeature( feat ) )
125 {
126 i++;
127 if ( feedback->isCanceled() )
128 {
129 break;
130 }
131
132 feedback->setProgress( i * step );
133 if ( !feat.hasGeometry() )
134 continue;
135 featureWithGeometryCount++;
136
137 QgsPointXY point;
140 else
141 {
142 const QgsGeometry centroid = feat.geometry().centroid();
143 if ( centroid.isNull() )
144 continue; // centroid failed, e.g. empty linestring
145
147 }
148
149 n++;
150
151 idToObj[feat.id()] = clusterFeatures.size();
152 clusterFeatures.emplace_back( Feature( point ) );
153 }
154
155 if ( n < k )
156 {
157 feedback->reportError( QObject::tr( "Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
158 k = n;
159 }
160
161 if ( k > 1 )
162 {
163 feedback->pushInfo( QObject::tr( "Calculating clusters" ) );
164
165 // cluster centers
166 std::vector<QgsPointXY> centers( k );
167 switch ( initializationMethod )
168 {
169 case 0: // farthest points
170 initClustersFarthestPoints( clusterFeatures, centers, k, feedback );
171 break;
172 case 1: // k-means++
173 initClustersPlusPlus( clusterFeatures, centers, k, feedback );
174 break;
175 default:
176 break;
177 }
178 calculateKMeans( clusterFeatures, centers, k, feedback );
179 }
180
181 // cluster size
182 std::unordered_map<int, int> clusterSize;
183 for ( auto it = idToObj.constBegin(); it != idToObj.constEnd(); ++it )
184 {
185 clusterSize[clusterFeatures[it.value()].cluster]++;
186 }
187
188 features = source->getFeatures();
189 i = 0;
190 while ( features.nextFeature( feat ) )
191 {
192 i++;
193 if ( feedback->isCanceled() )
194 {
195 break;
196 }
197
198 feedback->setProgress( 50 + i * step );
199 QgsAttributes attr = feat.attributes();
200 const auto obj = idToObj.find( feat.id() );
201 if ( !feat.hasGeometry() || obj == idToObj.end() )
202 {
203 attr << QVariant() << QVariant();
204 }
205 else if ( k <= 1 )
206 {
207 attr << 0 << featureWithGeometryCount;
208 }
209 else
210 {
211 const int cluster = clusterFeatures[*obj].cluster;
212 attr << cluster << clusterSize[cluster];
213 }
214 feat.setAttributes( attr );
215 if ( !sink->addFeature( feat, QgsFeatureSink::FastInsert ) )
216 throw QgsProcessingException( writeFeatureError( sink.get(), parameters, QStringLiteral( "OUTPUT" ) ) );
217 }
218
219 sink->finalize();
220
221 QVariantMap outputs;
222 outputs.insert( QStringLiteral( "OUTPUT" ), dest );
223 return outputs;
224}
225
226// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
227
228void QgsKMeansClusteringAlgorithm::initClustersFarthestPoints( std::vector<Feature> &points, std::vector<QgsPointXY> &centers, const int k, QgsProcessingFeedback *feedback )
229{
230 const std::size_t n = points.size();
231 if ( n == 0 )
232 return;
233
234 if ( n == 1 )
235 {
236 for ( int i = 0; i < k; i++ )
237 centers[i] = points[0].point;
238 return;
239 }
240
241 std::size_t duplicateCount = 1;
242 // initially scan for two most distance points from each other, p1 and p2
243 std::size_t p1 = 0;
244 std::size_t p2 = 0;
245 double distanceP1 = 0;
246 double distanceP2 = 0;
247 double maxDistance = -1;
248 for ( std::size_t i = 1; i < n; i++ )
249 {
250 distanceP1 = points[i].point.sqrDist( points[p1].point );
251 distanceP2 = points[i].point.sqrDist( points[p2].point );
252
253 // if this point is further then existing candidates, replace our choice
254 if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
255 {
256 maxDistance = std::max( distanceP1, distanceP2 );
257 if ( distanceP1 > distanceP2 )
258 p2 = i;
259 else
260 p1 = i;
261 }
262
263 // also record count of duplicate points
264 if ( qgsDoubleNear( distanceP1, 0 ) || qgsDoubleNear( distanceP2, 0 ) )
265 duplicateCount++;
266 }
267
268 if ( feedback && duplicateCount > 1 )
269 {
270 feedback->pushWarning( QObject::tr( "There are at least %n duplicate input(s), the number of output clusters may be less than was requested", nullptr, static_cast< int >( duplicateCount ) ) );
271 }
272
273 // By now two points should be found and be not the same
274 // Q_ASSERT( p1 != p2 && maxDistance >= 0 );
275
276 // Accept these two points as our initial centers
277 centers[0] = points[p1].point;
278 centers[1] = points[p2].point;
279
280 if ( k > 2 )
281 {
282 // array of minimum distance to a point from accepted cluster centers
283 std::vector<double> distances( n );
284
285 // initialize array with distance to first object
286 for ( std::size_t j = 0; j < n; j++ )
287 {
288 distances[j] = points[j].point.sqrDist( centers[0] );
289 }
290 distances[p1] = -1;
291 distances[p2] = -1;
292
293 // loop i on clusters, skip 0 and 1 as found already
294 for ( int i = 2; i < k; i++ )
295 {
296 std::size_t candidateCenter = 0;
297 double maxDistance = std::numeric_limits<double>::lowest();
298
299 // loop j on points
300 for ( std::size_t j = 0; j < n; j++ )
301 {
302 // accepted clusters are already marked with distance = -1
303 if ( distances[j] < 0 )
304 continue;
305
306 // update minimal distance with previously accepted cluster
307 distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
308
309 // greedily take a point that's farthest from any of accepted clusters
310 if ( distances[j] > maxDistance )
311 {
312 candidateCenter = j;
313 maxDistance = distances[j];
314 }
315 }
316
317 // checked earlier by counting entries on input, just in case
318 Q_ASSERT( maxDistance >= 0 );
319
320 // accept candidate to centers
321 distances[candidateCenter] = -1;
322 // copy the point coordinates into the initial centers array
323 centers[i] = points[candidateCenter].point;
324 }
325 }
326}
327
328void QgsKMeansClusteringAlgorithm::initClustersPlusPlus( std::vector<Feature> &points, std::vector<QgsPointXY> &centers, const int k, QgsProcessingFeedback *feedback )
329{
330 const std::size_t n = points.size();
331 if ( n == 0 )
332 return;
333
334 if ( n == 1 )
335 {
336 for ( int i = 0; i < k; i++ )
337 centers[i] = points[0].point;
338 return;
339 }
340
341 // randomly select the first point
342 std::random_device rd;
343 std::mt19937 gen( rd() );
344 std::uniform_int_distribution<size_t> distrib( 0, n - 1 );
345
346 std::size_t p1 = distrib( gen );
347 centers[0] = points[p1].point;
348
349 // calculate distances and total error (sum of distances of points to center)
350 std::vector<double> distances( n );
351 double totalError = 0;
352 std::size_t duplicateCount = 1;
353 for ( size_t i = 0; i < n; i++ )
354 {
355 double distance = points[i].point.sqrDist( centers[0] );
356 distances[i] = distance;
357 totalError += distance;
358 if ( qgsDoubleNear( distance, 0 ) )
359 {
360 duplicateCount++;
361 }
362 }
363 if ( feedback && duplicateCount > 1 )
364 {
365 feedback->pushWarning( QObject::tr( "There are at least %n duplicate input(s), the number of output clusters may be less than was requested", nullptr, static_cast< int >( duplicateCount ) ) );
366 }
367
368 // greedy kmeans++
369 // test not only one center but L possible centers
370 // chosen independently according to the same probability distribution), and then among these L
371 // centers, the one that decreases the k-means cost the most is chosen
372 // Bhattacharya, Anup & Eube, Jan & Röglin, Heiko & Schmidt, Melanie. (2019). Noisy, greedy and Not So greedy k-means++
373 unsigned int numCandidateCenters = 2 + static_cast< int >( std::floor( std::log( k ) ) );
374 std::vector<double> randomNumbers( numCandidateCenters );
375 std::vector<size_t> candidateCenters( numCandidateCenters );
376
377 std::uniform_real_distribution<double> dis( 0.0, 1.0 );
378 for ( int i = 1; i < k; i++ )
379 {
380 // sampling with probability proportional to the squared distance to the closest existing center
381 for ( unsigned int j = 0; j < numCandidateCenters; j++ )
382 {
383 randomNumbers[j] = dis( gen ) * totalError;
384 }
385
386 // cumulative sum, keep distances for later use
387 std::vector<double> cumSum = distances;
388 for ( size_t j = 1; j < n; j++ )
389 {
390 cumSum[j] += cumSum[j - 1];
391 }
392
393 // binary search for the index of the first element greater than or equal to random numbers
394 for ( unsigned int j = 0; j < numCandidateCenters; j++ )
395 {
396 size_t low = 0;
397 size_t high = n - 1;
398
399 while ( low <= high )
400 {
401 size_t mid = low + ( high - low ) / 2;
402 if ( cumSum[mid] < randomNumbers[j] )
403 {
404 low = mid + 1;
405 }
406 else
407 {
408 // size_t cannot be negative
409 if ( mid == 0 )
410 break;
411
412 high = mid - 1;
413 }
414 }
415 // clip candidate center to the number of points
416 if ( low >= n )
417 {
418 low = n - 1;
419 }
420 candidateCenters[j] = low;
421 }
422
423 std::vector<std::vector<double>> distancesCandidateCenters( numCandidateCenters, std::vector<double>( n ) );
424 ;
425
426 // store distances between previous and new candidate center, error and best candidate index
427 double currentError = 0;
428 double lowestError = std::numeric_limits<double>::max();
429 unsigned int bestCandidateIndex = 0;
430 for ( unsigned int j = 0; j < numCandidateCenters; j++ )
431 {
432 for ( size_t z = 0; z < n; z++ )
433 {
434 // distance to candidate center
435 double distance = points[candidateCenters[j]].point.sqrDist( points[z].point );
436 // if distance to previous center is less than the current distance, use that
437 if ( distance > distances[z] )
438 {
439 distance = distances[z];
440 }
441 distancesCandidateCenters[j][z] = distance;
442 currentError += distance;
443 }
444 if ( lowestError > currentError )
445 {
446 lowestError = currentError;
447 bestCandidateIndex = j;
448 }
449 }
450
451 // update distances with the best candidate center values
452 for ( size_t j = 0; j < n; j++ )
453 {
454 distances[j] = distancesCandidateCenters[bestCandidateIndex][j];
455 }
456 // store the best candidate center
457 centers[i] = points[candidateCenters[bestCandidateIndex]].point;
458 // update error
459 totalError = lowestError;
460 }
461}
462
463// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
464
465void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> &centers, int k, QgsProcessingFeedback *feedback )
466{
467 int converged = false;
468 bool changed = false;
469
470 // avoid reallocating weights array for every iteration
471 std::vector<uint> weights( k );
472
473 uint i = 0;
474 for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
475 {
476 if ( feedback && feedback->isCanceled() )
477 break;
478
479 findNearest( objs, centers, k, changed );
480 updateMeans( objs, centers, weights, k );
481 converged = !changed;
482 }
483
484 if ( !converged && feedback )
485 feedback->reportError( QObject::tr( "Clustering did not converge after %n iteration(s)", nullptr, static_cast<int>( i ) ) );
486 else if ( feedback )
487 feedback->pushInfo( QObject::tr( "Clustering converged after %n iteration(s)", nullptr, static_cast<int>( i ) ) );
488}
489
490// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
491
492void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points, const std::vector<QgsPointXY> &centers, const int k, bool &changed )
493{
494 changed = false;
495 const std::size_t n = points.size();
496 for ( std::size_t i = 0; i < n; i++ )
497 {
498 Feature &point = points[i];
499
500 // Initialize with distance to first cluster
501 double currentDistance = point.point.sqrDist( centers[0] );
502 int currentCluster = 0;
503
504 // Check all other cluster centers and find the nearest
505 for ( int cluster = 1; cluster < k; cluster++ )
506 {
507 const double distance = point.point.sqrDist( centers[cluster] );
508 if ( distance < currentDistance )
509 {
510 currentDistance = distance;
511 currentCluster = cluster;
512 }
513 }
514
515 // Store the nearest cluster this object is in
516 if ( point.cluster != currentCluster )
517 {
518 changed = true;
519 point.cluster = currentCluster;
520 }
521 }
522}
523
524// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
525
526void QgsKMeansClusteringAlgorithm::updateMeans( const std::vector<Feature> &points, std::vector<QgsPointXY> &centers, std::vector<uint> &weights, const int k )
527{
528 const uint n = points.size();
529 std::fill( weights.begin(), weights.end(), 0 );
530 for ( int i = 0; i < k; i++ )
531 {
532 centers[i].setX( 0.0 );
533 centers[i].setY( 0.0 );
534 }
535 for ( uint i = 0; i < n; i++ )
536 {
537 const int cluster = points[i].cluster;
538 centers[cluster] += QgsVector( points[i].point.x(), points[i].point.y() );
539 weights[cluster] += 1;
540 }
541 for ( int i = 0; i < k; i++ )
542 {
543 centers[i] /= weights[i];
544 }
545}
546
547
@ VectorAnyGeometry
Any vector layer with geometry.
Definition qgis.h:3533
@ Point
Point.
Definition qgis.h:279
@ Advanced
Parameter is an advanced parameter which should be hidden from users by default.
Definition qgis.h:3763
A vector of attributes.
Wrapper for iterator of features from vector data provider or vector layer.
bool nextFeature(QgsFeature &f)
Fetch next feature and stores in f, returns true on success.
Wraps a request for features to a vector layer (or directly its vector data provider).
@ FastInsert
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
The feature class encapsulates a single feature including its unique ID, geometry and a list of field...
Definition qgsfeature.h:58
QgsAttributes attributes
Definition qgsfeature.h:67
QgsFeatureId id
Definition qgsfeature.h:66
void setAttributes(const QgsAttributes &attrs)
Sets the feature's attributes.
QgsGeometry geometry
Definition qgsfeature.h:69
bool hasGeometry() const
Returns true if the feature has an associated geometry.
bool isCanceled() const
Tells whether the operation has been canceled already.
Definition qgsfeedback.h:53
void setProgress(double progress)
Sets the current progress for the feedback object.
Definition qgsfeedback.h:61
Encapsulate a field in an attribute table or data source.
Definition qgsfield.h:54
Container of fields for a vector layer.
Definition qgsfields.h:46
bool append(const QgsField &field, Qgis::FieldOrigin origin=Qgis::FieldOrigin::Provider, int originIndex=-1)
Appends a field.
Definition qgsfields.cpp:73
A geometry is the spatial representation of a feature.
const QgsAbstractGeometry * constGet() const
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
QgsGeometry centroid() const
Returns the center of mass of a geometry.
Qgis::WkbType wkbType() const
Returns type of the geometry as a WKB type (point / linestring / polygon etc.).
Represents a 2D point.
Definition qgspointxy.h:60
Contains information about the context in which a processing algorithm is executed.
Custom exception class for processing related exceptions.
Base class for providing feedback from a processing algorithm.
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
virtual void pushWarning(const QString &warning)
Pushes a warning informational message from the algorithm.
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
An enum based parameter for processing algorithms, allowing for selection from predefined values.
A feature sink output for processing algorithms.
An input feature source (such as vector layers) parameter for processing algorithms.
A numeric parameter for processing algorithms.
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
Represent a 2-dimensional vector.
Definition qgsvector.h:31
static Qgis::WkbType flatType(Qgis::WkbType type)
Returns the flat type for a WKB type.
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference).
Definition qgis.h:6607
T qgsgeometry_cast(QgsAbstractGeometry *geom)