QGIS API Documentation 4.1.0-Master (ca2ac17535b)
Loading...
Searching...
No Matches
qgsalgorithmkmeansclustering.cpp
Go to the documentation of this file.
1/***************************************************************************
2 qgsalgorithmkmeansclustering.cpp
3 ---------------------
4 begin : June 2018
5 copyright : (C) 2018 by Nyall Dawson
6 email : nyall dot dawson at gmail dot com
7 ***************************************************************************/
8
9/***************************************************************************
10 * *
11 * This program is free software; you can redistribute it and/or modify *
12 * it under the terms of the GNU General Public License as published by *
13 * the Free Software Foundation; either version 2 of the License, or *
14 * (at your option) any later version. *
15 * *
16 ***************************************************************************/
17
19
20#include <random>
21#include <unordered_map>
22
23#include <QString>
24
25using namespace Qt::StringLiterals;
26
28
29constexpr uint KMEANS_MAX_ITERATIONS = 1000;
30
31QString QgsKMeansClusteringAlgorithm::name() const
32{
33 return u"kmeansclustering"_s;
34}
35
36QString QgsKMeansClusteringAlgorithm::displayName() const
37{
38 return QObject::tr( "K-means clustering" );
39}
40
41QStringList QgsKMeansClusteringAlgorithm::tags() const
42{
43 return QObject::tr( "clustering,clusters,kmeans,points" ).split( ',' );
44}
45
46QString QgsKMeansClusteringAlgorithm::group() const
47{
48 return QObject::tr( "Vector analysis" );
49}
50
51QString QgsKMeansClusteringAlgorithm::groupId() const
52{
53 return u"vectoranalysis"_s;
54}
55
56void QgsKMeansClusteringAlgorithm::initAlgorithm( const QVariantMap & )
57{
58 addParameter( new QgsProcessingParameterFeatureSource( u"INPUT"_s, QObject::tr( "Input layer" ), QList<int>() << static_cast<int>( Qgis::ProcessingSourceType::VectorAnyGeometry ) ) );
59 addParameter( new QgsProcessingParameterNumber( u"CLUSTERS"_s, QObject::tr( "Number of clusters" ), Qgis::ProcessingNumberParameterType::Integer, 5, false, 1 ) );
60
61 QStringList initializationMethods;
62 initializationMethods << QObject::tr( "Farthest points" ) << QObject::tr( "K-means++" );
63 addParameter( new QgsProcessingParameterEnum( u"METHOD"_s, QObject::tr( "Method" ), initializationMethods, false, 0, false ) );
64
65 auto fieldNameParam = std::make_unique<QgsProcessingParameterString>( u"FIELD_NAME"_s, QObject::tr( "Cluster field name" ), u"CLUSTER_ID"_s );
66 fieldNameParam->setFlags( fieldNameParam->flags() | Qgis::ProcessingParameterFlag::Advanced );
67 addParameter( fieldNameParam.release() );
68 auto sizeFieldNameParam = std::make_unique<QgsProcessingParameterString>( u"SIZE_FIELD_NAME"_s, QObject::tr( "Cluster size field name" ), u"CLUSTER_SIZE"_s );
69 sizeFieldNameParam->setFlags( sizeFieldNameParam->flags() | Qgis::ProcessingParameterFlag::Advanced );
70 addParameter( sizeFieldNameParam.release() );
71
72 addParameter( new QgsProcessingParameterFeatureSink( u"OUTPUT"_s, QObject::tr( "Clusters" ), Qgis::ProcessingSourceType::VectorAnyGeometry ) );
73}
74
75QString QgsKMeansClusteringAlgorithm::shortHelpString() const
76{
77 return QObject::tr(
78 "This algorithm calculates the 2D distance based k-means cluster number for each input feature.\n\n"
79 "If input geometries are lines or polygons, the clustering is based on the centroid of the feature.\n\n"
80 "References:\n"
81 "Arthur, David & Vassilvitskii, Sergei. (2007). K-Means++: The Advantages of Careful Seeding. Proc. of the Annu. ACM-SIAM Symp. on Discrete Algorithms. 8.\n\n"
82 "Bhattacharya, Anup & Eube, Jan & Röglin, Heiko & Schmidt, Melanie. (2019). Noisy, Greedy and Not So Greedy k-means++"
83 );
84}
85
86QString QgsKMeansClusteringAlgorithm::shortDescription() const
87{
88 return QObject::tr( "Calculates the 2D distance based k-means cluster number for each input feature." );
89}
90
91QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance() const
92{
93 return new QgsKMeansClusteringAlgorithm();
94}
95
96QVariantMap QgsKMeansClusteringAlgorithm::processAlgorithm( const QVariantMap &parameters, QgsProcessingContext &context, QgsProcessingFeedback *feedback )
97{
98 std::unique_ptr<QgsProcessingFeatureSource> source( parameterAsSource( parameters, u"INPUT"_s, context ) );
99 if ( !source )
100 throw QgsProcessingException( invalidSourceError( parameters, u"INPUT"_s ) );
101
102 int k = parameterAsInt( parameters, u"CLUSTERS"_s, context );
103 int initializationMethod = parameterAsInt( parameters, u"METHOD"_s, context );
104
105 QgsFields outputFields = source->fields();
106 QgsFields newFields;
107 const QString clusterFieldName = parameterAsString( parameters, u"FIELD_NAME"_s, context );
108 newFields.append( QgsField( clusterFieldName, QMetaType::Type::Int ) );
109 const QString clusterSizeFieldName = parameterAsString( parameters, u"SIZE_FIELD_NAME"_s, context );
110 newFields.append( QgsField( clusterSizeFieldName, QMetaType::Type::Int ) );
111 outputFields = QgsProcessingUtils::combineFields( outputFields, newFields );
112
113 QString dest;
114 std::unique_ptr<QgsFeatureSink> sink( parameterAsSink( parameters, u"OUTPUT"_s, context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
115 if ( !sink )
116 throw QgsProcessingException( invalidSinkError( parameters, u"OUTPUT"_s ) );
117
118 // build list of point inputs - if it's already a point, use that. If not, take the centroid.
119 feedback->pushInfo( QObject::tr( "Collecting input points" ) );
120 const double step = source->featureCount() > 0 ? 50.0 / static_cast< double >( source->featureCount() ) : 1;
121 int i = 0;
122 int n = 0;
123 int featureWithGeometryCount = 0;
124 QgsFeature feat;
125
126 std::vector<Feature> clusterFeatures;
127 QgsFeatureIterator features = source->getFeatures( QgsFeatureRequest().setNoAttributes() );
128 QHash<QgsFeatureId, std::size_t> idToObj;
129 while ( features.nextFeature( feat ) )
130 {
131 i++;
132 if ( feedback->isCanceled() )
133 {
134 break;
135 }
136
137 feedback->setProgress( i * step );
138 if ( !feat.hasGeometry() )
139 continue;
140 featureWithGeometryCount++;
141
142 QgsPointXY point;
145 else
146 {
147 const QgsGeometry centroid = feat.geometry().centroid();
148 if ( centroid.isNull() )
149 continue; // centroid failed, e.g. empty linestring
150
152 }
153
154 n++;
155
156 idToObj[feat.id()] = clusterFeatures.size();
157 clusterFeatures.emplace_back( Feature( point ) );
158 }
159
160 if ( n < k )
161 {
162 feedback->reportError( QObject::tr( "Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
163 k = n;
164 }
165
166 if ( k > 1 )
167 {
168 feedback->pushInfo( QObject::tr( "Calculating clusters" ) );
169
170 // cluster centers
171 std::vector<QgsPointXY> centers( k );
172 switch ( initializationMethod )
173 {
174 case 0: // farthest points
175 initClustersFarthestPoints( clusterFeatures, centers, k, feedback );
176 break;
177 case 1: // k-means++
178 initClustersPlusPlus( clusterFeatures, centers, k, feedback );
179 break;
180 default:
181 break;
182 }
183 calculateKMeans( clusterFeatures, centers, k, feedback );
184 }
185
186 // cluster size
187 std::unordered_map<int, int> clusterSize;
188 for ( auto it = idToObj.constBegin(); it != idToObj.constEnd(); ++it )
189 {
190 clusterSize[clusterFeatures[it.value()].cluster]++;
191 }
192
193 features = source->getFeatures();
194 i = 0;
195 while ( features.nextFeature( feat ) )
196 {
197 i++;
198 if ( feedback->isCanceled() )
199 {
200 break;
201 }
202
203 feedback->setProgress( 50 + i * step );
204 QgsAttributes attr = feat.attributes();
205 const auto obj = idToObj.find( feat.id() );
206 if ( !feat.hasGeometry() || obj == idToObj.end() )
207 {
208 attr << QVariant() << QVariant();
209 }
210 else if ( k <= 1 )
211 {
212 attr << 0 << featureWithGeometryCount;
213 }
214 else
215 {
216 const int cluster = clusterFeatures[*obj].cluster;
217 attr << cluster << clusterSize[cluster];
218 }
219 feat.setAttributes( attr );
220 if ( !sink->addFeature( feat, QgsFeatureSink::FastInsert ) )
221 throw QgsProcessingException( writeFeatureError( sink.get(), parameters, u"OUTPUT"_s ) );
222 else
223 feedback->featureAddedToSink( u"OUTPUT"_s );
224 }
225
226 sink->finalize();
227 feedback->featureSinkFinalized( u"OUTPUT"_s );
228
229 QVariantMap outputs;
230 outputs.insert( u"OUTPUT"_s, dest );
231 return outputs;
232}
233
234// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
235
236void QgsKMeansClusteringAlgorithm::initClustersFarthestPoints( std::vector<Feature> &points, std::vector<QgsPointXY> &centers, const int k, QgsProcessingFeedback *feedback )
237{
238 const std::size_t n = points.size();
239 if ( n == 0 )
240 return;
241
242 if ( n == 1 )
243 {
244 for ( int i = 0; i < k; i++ )
245 centers[i] = points[0].point;
246 return;
247 }
248
249 std::size_t duplicateCount = 1;
250 // initially scan for two most distance points from each other, p1 and p2
251 std::size_t p1 = 0;
252 std::size_t p2 = 0;
253 double distanceP1 = 0;
254 double distanceP2 = 0;
255 double maxDistance = -1;
256 for ( std::size_t i = 1; i < n; i++ )
257 {
258 distanceP1 = points[i].point.sqrDist( points[p1].point );
259 distanceP2 = points[i].point.sqrDist( points[p2].point );
260
261 // if this point is further then existing candidates, replace our choice
262 if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
263 {
264 maxDistance = std::max( distanceP1, distanceP2 );
265 if ( distanceP1 > distanceP2 )
266 p2 = i;
267 else
268 p1 = i;
269 }
270
271 // also record count of duplicate points
272 if ( qgsDoubleNear( distanceP1, 0 ) || qgsDoubleNear( distanceP2, 0 ) )
273 duplicateCount++;
274 }
275
276 if ( feedback && duplicateCount > 1 )
277 {
278 feedback->pushWarning( QObject::tr( "There are at least %n duplicate input(s), the number of output clusters may be less than was requested", nullptr, static_cast< int >( duplicateCount ) ) );
279 }
280
281 // By now two points should be found and be not the same
282 // Q_ASSERT( p1 != p2 && maxDistance >= 0 );
283
284 // Accept these two points as our initial centers
285 centers[0] = points[p1].point;
286 centers[1] = points[p2].point;
287
288 if ( k > 2 )
289 {
290 // array of minimum distance to a point from accepted cluster centers
291 std::vector<double> distances( n );
292
293 // initialize array with distance to first object
294 for ( std::size_t j = 0; j < n; j++ )
295 {
296 distances[j] = points[j].point.sqrDist( centers[0] );
297 }
298 distances[p1] = -1;
299 distances[p2] = -1;
300
301 // loop i on clusters, skip 0 and 1 as found already
302 for ( int i = 2; i < k; i++ )
303 {
304 std::size_t candidateCenter = 0;
305 double maxDistance = std::numeric_limits<double>::lowest();
306
307 // loop j on points
308 for ( std::size_t j = 0; j < n; j++ )
309 {
310 // accepted clusters are already marked with distance = -1
311 if ( distances[j] < 0 )
312 continue;
313
314 // update minimal distance with previously accepted cluster
315 distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
316
317 // greedily take a point that's farthest from any of accepted clusters
318 if ( distances[j] > maxDistance )
319 {
320 candidateCenter = j;
321 maxDistance = distances[j];
322 }
323 }
324
325 // checked earlier by counting entries on input, just in case
326 Q_ASSERT( maxDistance >= 0 );
327
328 // accept candidate to centers
329 distances[candidateCenter] = -1;
330 // copy the point coordinates into the initial centers array
331 centers[i] = points[candidateCenter].point;
332 }
333 }
334}
335
336void QgsKMeansClusteringAlgorithm::initClustersPlusPlus( std::vector<Feature> &points, std::vector<QgsPointXY> &centers, const int k, QgsProcessingFeedback *feedback )
337{
338 const std::size_t n = points.size();
339 if ( n == 0 )
340 return;
341
342 if ( n == 1 )
343 {
344 for ( int i = 0; i < k; i++ )
345 centers[i] = points[0].point;
346 return;
347 }
348
349 // randomly select the first point
350 std::random_device rd;
351 std::mt19937 gen( rd() );
352 std::uniform_int_distribution<size_t> distrib( 0, n - 1 );
353
354 std::size_t p1 = distrib( gen );
355 centers[0] = points[p1].point;
356
357 // calculate distances and total error (sum of distances of points to center)
358 std::vector<double> distances( n );
359 double totalError = 0;
360 std::size_t duplicateCount = 1;
361 for ( size_t i = 0; i < n; i++ )
362 {
363 double distance = points[i].point.sqrDist( centers[0] );
364 distances[i] = distance;
365 totalError += distance;
366 if ( qgsDoubleNear( distance, 0 ) )
367 {
368 duplicateCount++;
369 }
370 }
371 if ( feedback && duplicateCount > 1 )
372 {
373 feedback->pushWarning( QObject::tr( "There are at least %n duplicate input(s), the number of output clusters may be less than was requested", nullptr, static_cast< int >( duplicateCount ) ) );
374 }
375
376 // greedy kmeans++
377 // test not only one center but L possible centers
378 // chosen independently according to the same probability distribution), and then among these L
379 // centers, the one that decreases the k-means cost the most is chosen
380 // Bhattacharya, Anup & Eube, Jan & Röglin, Heiko & Schmidt, Melanie. (2019). Noisy, greedy and Not So greedy k-means++
381 unsigned int numCandidateCenters = 2 + static_cast< int >( std::floor( std::log( k ) ) );
382 std::vector<double> randomNumbers( numCandidateCenters );
383 std::vector<size_t> candidateCenters( numCandidateCenters );
384
385 std::uniform_real_distribution<double> dis( 0.0, 1.0 );
386 for ( int i = 1; i < k; i++ )
387 {
388 // sampling with probability proportional to the squared distance to the closest existing center
389 for ( unsigned int j = 0; j < numCandidateCenters; j++ )
390 {
391 randomNumbers[j] = dis( gen ) * totalError;
392 }
393
394 // cumulative sum, keep distances for later use
395 std::vector<double> cumSum = distances;
396 for ( size_t j = 1; j < n; j++ )
397 {
398 cumSum[j] += cumSum[j - 1];
399 }
400
401 // binary search for the index of the first element greater than or equal to random numbers
402 for ( unsigned int j = 0; j < numCandidateCenters; j++ )
403 {
404 size_t low = 0;
405 size_t high = n - 1;
406
407 while ( low <= high )
408 {
409 size_t mid = low + ( high - low ) / 2;
410 if ( cumSum[mid] < randomNumbers[j] )
411 {
412 low = mid + 1;
413 }
414 else
415 {
416 // size_t cannot be negative
417 if ( mid == 0 )
418 break;
419
420 high = mid - 1;
421 }
422 }
423 // clip candidate center to the number of points
424 if ( low >= n )
425 {
426 low = n - 1;
427 }
428 candidateCenters[j] = low;
429 }
430
431 std::vector<std::vector<double>> distancesCandidateCenters( numCandidateCenters, std::vector<double>( n ) );
432 ;
433
434 // store distances between previous and new candidate center, error and best candidate index
435 double currentError = 0;
436 double lowestError = std::numeric_limits<double>::max();
437 unsigned int bestCandidateIndex = 0;
438 for ( unsigned int j = 0; j < numCandidateCenters; j++ )
439 {
440 for ( size_t z = 0; z < n; z++ )
441 {
442 // distance to candidate center
443 double distance = points[candidateCenters[j]].point.sqrDist( points[z].point );
444 // if distance to previous center is less than the current distance, use that
445 if ( distance > distances[z] )
446 {
447 distance = distances[z];
448 }
449 distancesCandidateCenters[j][z] = distance;
450 currentError += distance;
451 }
452 if ( lowestError > currentError )
453 {
454 lowestError = currentError;
455 bestCandidateIndex = j;
456 }
457 }
458
459 // update distances with the best candidate center values
460 for ( size_t j = 0; j < n; j++ )
461 {
462 distances[j] = distancesCandidateCenters[bestCandidateIndex][j];
463 }
464 // store the best candidate center
465 centers[i] = points[candidateCenters[bestCandidateIndex]].point;
466 // update error
467 totalError = lowestError;
468 }
469}
470
471// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
472
473void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> &centers, int k, QgsProcessingFeedback *feedback )
474{
475 int converged = false;
476 bool changed = false;
477
478 // avoid reallocating weights array for every iteration
479 std::vector<uint> weights( k );
480
481 uint i = 0;
482 for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
483 {
484 if ( feedback && feedback->isCanceled() )
485 break;
486
487 findNearest( objs, centers, k, changed );
488 updateMeans( objs, centers, weights, k );
489 converged = !changed;
490 }
491
492 if ( !converged && feedback )
493 feedback->reportError( QObject::tr( "Clustering did not converge after %n iteration(s)", nullptr, static_cast<int>( i ) ) );
494 else if ( feedback )
495 feedback->pushInfo( QObject::tr( "Clustering converged after %n iteration(s)", nullptr, static_cast<int>( i ) ) );
496}
497
498// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
499
500void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points, const std::vector<QgsPointXY> &centers, const int k, bool &changed )
501{
502 changed = false;
503 const std::size_t n = points.size();
504 for ( std::size_t i = 0; i < n; i++ )
505 {
506 Feature &point = points[i];
507
508 // Initialize with distance to first cluster
509 double currentDistance = point.point.sqrDist( centers[0] );
510 int currentCluster = 0;
511
512 // Check all other cluster centers and find the nearest
513 for ( int cluster = 1; cluster < k; cluster++ )
514 {
515 const double distance = point.point.sqrDist( centers[cluster] );
516 if ( distance < currentDistance )
517 {
518 currentDistance = distance;
519 currentCluster = cluster;
520 }
521 }
522
523 // Store the nearest cluster this object is in
524 if ( point.cluster != currentCluster )
525 {
526 changed = true;
527 point.cluster = currentCluster;
528 }
529 }
530}
531
532// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
533
534void QgsKMeansClusteringAlgorithm::updateMeans( const std::vector<Feature> &points, std::vector<QgsPointXY> &centers, std::vector<uint> &weights, const int k )
535{
536 const uint n = points.size();
537 std::fill( weights.begin(), weights.end(), 0 );
538 for ( int i = 0; i < k; i++ )
539 {
540 centers[i].setX( 0.0 );
541 centers[i].setY( 0.0 );
542 }
543 for ( uint i = 0; i < n; i++ )
544 {
545 const int cluster = points[i].cluster;
546 centers[cluster] += QgsVector( points[i].point.x(), points[i].point.y() );
547 weights[cluster] += 1;
548 }
549 for ( int i = 0; i < k; i++ )
550 {
551 centers[i] /= weights[i];
552 }
553}
554
555
@ VectorAnyGeometry
Any vector layer with geometry.
Definition qgis.h:3714
@ Point
Point.
Definition qgis.h:296
@ Advanced
Parameter is an advanced parameter which should be hidden from users by default.
Definition qgis.h:3947
A vector of attributes.
Wrapper for iterator of features from vector data provider or vector layer.
bool nextFeature(QgsFeature &f)
Fetch next feature and stores in f, returns true on success.
Wraps a request for features to a vector layer (or directly its vector data provider).
@ FastInsert
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
The feature class encapsulates a single feature including its unique ID, geometry and a list of field...
Definition qgsfeature.h:60
QgsAttributes attributes
Definition qgsfeature.h:64
QgsFeatureId id
Definition qgsfeature.h:63
void setAttributes(const QgsAttributes &attrs)
Sets the feature's attributes.
QgsGeometry geometry
Definition qgsfeature.h:66
bool hasGeometry() const
Returns true if the feature has an associated geometry.
bool isCanceled() const
Tells whether the operation has been canceled already.
Definition qgsfeedback.h:56
void setProgress(double progress)
Sets the current progress for the feedback object.
Definition qgsfeedback.h:65
Encapsulate a field in an attribute table or data source.
Definition qgsfield.h:56
Container of fields for a vector layer.
Definition qgsfields.h:46
bool append(const QgsField &field, Qgis::FieldOrigin origin=Qgis::FieldOrigin::Provider, int originIndex=-1)
Appends a field.
Definition qgsfields.cpp:75
A geometry is the spatial representation of a feature.
const QgsAbstractGeometry * constGet() const
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
QgsGeometry centroid() const
Returns the center of mass of a geometry.
Qgis::WkbType wkbType() const
Returns type of the geometry as a WKB type (point / linestring / polygon etc.).
Represents a 2D point.
Definition qgspointxy.h:62
Contains information about the context in which a processing algorithm is executed.
Custom exception class for processing related exceptions.
Base class for providing feedback from a processing algorithm.
void featureAddedToSink(const QString &output)
Reports that a feature was added to the the sink associated with the specified algorithm output.
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
virtual void pushWarning(const QString &warning)
Pushes a warning informational message from the algorithm.
void featureSinkFinalized(const QString &output)
Reports that a feature sink has been finalized.
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
An enum based parameter for processing algorithms, allowing for selection from predefined values.
A feature sink output for processing algorithms.
An input feature source (such as vector layers) parameter for processing algorithms.
A numeric parameter for processing algorithms.
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
Represent a 2-dimensional vector.
Definition qgsvector.h:34
static Qgis::WkbType flatType(Qgis::WkbType type)
Returns the flat type for a WKB type.
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference).
Definition qgis.h:7236
T qgsgeometry_cast(QgsAbstractGeometry *geom)