QGIS API Documentation 3.43.0-Master (e01d6d7c4c0)
qgsalgorithmkmeansclustering.cpp
Go to the documentation of this file.
1/***************************************************************************
2 qgsalgorithmkmeansclustering.cpp
3 ---------------------
4 begin : June 2018
5 copyright : (C) 2018 by Nyall Dawson
6 email : nyall dot dawson at gmail dot com
7 ***************************************************************************/
8
9/***************************************************************************
10 * *
11 * This program is free software; you can redistribute it and/or modify *
12 * it under the terms of the GNU General Public License as published by *
13 * the Free Software Foundation; either version 2 of the License, or *
14 * (at your option) any later version. *
15 * *
16 ***************************************************************************/
17
19#include <unordered_map>
20
22
23const int KMEANS_MAX_ITERATIONS = 1000;
24
25QString QgsKMeansClusteringAlgorithm::name() const
26{
27 return QStringLiteral( "kmeansclustering" );
28}
29
30QString QgsKMeansClusteringAlgorithm::displayName() const
31{
32 return QObject::tr( "K-means clustering" );
33}
34
35QStringList QgsKMeansClusteringAlgorithm::tags() const
36{
37 return QObject::tr( "clustering,clusters,kmeans,points" ).split( ',' );
38}
39
40QString QgsKMeansClusteringAlgorithm::group() const
41{
42 return QObject::tr( "Vector analysis" );
43}
44
45QString QgsKMeansClusteringAlgorithm::groupId() const
46{
47 return QStringLiteral( "vectoranalysis" );
48}
49
50void QgsKMeansClusteringAlgorithm::initAlgorithm( const QVariantMap & )
51{
52 addParameter( new QgsProcessingParameterFeatureSource( QStringLiteral( "INPUT" ), QObject::tr( "Input layer" ), QList<int>() << static_cast<int>( Qgis::ProcessingSourceType::VectorAnyGeometry ) ) );
53 addParameter( new QgsProcessingParameterNumber( QStringLiteral( "CLUSTERS" ), QObject::tr( "Number of clusters" ), Qgis::ProcessingNumberParameterType::Integer, 5, false, 1 ) );
54
55 auto fieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral( "FIELD_NAME" ), QObject::tr( "Cluster field name" ), QStringLiteral( "CLUSTER_ID" ) );
56 fieldNameParam->setFlags( fieldNameParam->flags() | Qgis::ProcessingParameterFlag::Advanced );
57 addParameter( fieldNameParam.release() );
58 auto sizeFieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral( "SIZE_FIELD_NAME" ), QObject::tr( "Cluster size field name" ), QStringLiteral( "CLUSTER_SIZE" ) );
59 sizeFieldNameParam->setFlags( sizeFieldNameParam->flags() | Qgis::ProcessingParameterFlag::Advanced );
60 addParameter( sizeFieldNameParam.release() );
61
62 addParameter( new QgsProcessingParameterFeatureSink( QStringLiteral( "OUTPUT" ), QObject::tr( "Clusters" ), Qgis::ProcessingSourceType::VectorAnyGeometry ) );
63}
64
65QString QgsKMeansClusteringAlgorithm::shortHelpString() const
66{
67 return QObject::tr( "This algorithm calculates the 2D distance based k-means cluster number for each input feature.\n\n"
68 "If input geometries are lines or polygons, the clustering is based on the centroid of the feature." );
69}
70
71QString QgsKMeansClusteringAlgorithm::shortDescription() const
72{
73 return QObject::tr( "Calculates the 2D distance based k-means cluster number for each input feature." );
74}
75
76QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance() const
77{
78 return new QgsKMeansClusteringAlgorithm();
79}
80
81QVariantMap QgsKMeansClusteringAlgorithm::processAlgorithm( const QVariantMap &parameters, QgsProcessingContext &context, QgsProcessingFeedback *feedback )
82{
83 std::unique_ptr<QgsProcessingFeatureSource> source( parameterAsSource( parameters, QStringLiteral( "INPUT" ), context ) );
84 if ( !source )
85 throw QgsProcessingException( invalidSourceError( parameters, QStringLiteral( "INPUT" ) ) );
86
87 int k = parameterAsInt( parameters, QStringLiteral( "CLUSTERS" ), context );
88
89 QgsFields outputFields = source->fields();
90 QgsFields newFields;
91 const QString clusterFieldName = parameterAsString( parameters, QStringLiteral( "FIELD_NAME" ), context );
92 newFields.append( QgsField( clusterFieldName, QMetaType::Type::Int ) );
93 const QString clusterSizeFieldName = parameterAsString( parameters, QStringLiteral( "SIZE_FIELD_NAME" ), context );
94 newFields.append( QgsField( clusterSizeFieldName, QMetaType::Type::Int ) );
95 outputFields = QgsProcessingUtils::combineFields( outputFields, newFields );
96
97 QString dest;
98 std::unique_ptr<QgsFeatureSink> sink( parameterAsSink( parameters, QStringLiteral( "OUTPUT" ), context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
99 if ( !sink )
100 throw QgsProcessingException( invalidSinkError( parameters, QStringLiteral( "OUTPUT" ) ) );
101
102 // build list of point inputs - if it's already a point, use that. If not, take the centroid.
103 feedback->pushInfo( QObject::tr( "Collecting input points" ) );
104 const double step = source->featureCount() > 0 ? 50.0 / source->featureCount() : 1;
105 int i = 0;
106 int n = 0;
107 int featureWithGeometryCount = 0;
108 QgsFeature feat;
109
110 std::vector<Feature> clusterFeatures;
111 QgsFeatureIterator features = source->getFeatures( QgsFeatureRequest().setNoAttributes() );
112 QHash<QgsFeatureId, int> idToObj;
113 while ( features.nextFeature( feat ) )
114 {
115 i++;
116 if ( feedback->isCanceled() )
117 {
118 break;
119 }
120
121 feedback->setProgress( i * step );
122 if ( !feat.hasGeometry() )
123 continue;
124 featureWithGeometryCount++;
125
126 QgsPointXY point;
128 point = QgsPointXY( *qgsgeometry_cast<const QgsPoint *>( feat.geometry().constGet() ) );
129 else
130 {
131 const QgsGeometry centroid = feat.geometry().centroid();
132 if ( centroid.isNull() )
133 continue; // centroid failed, e.g. empty linestring
134
135 point = QgsPointXY( *qgsgeometry_cast<const QgsPoint *>( centroid.constGet() ) );
136 }
137
138 n++;
139
140 idToObj[feat.id()] = clusterFeatures.size();
141 clusterFeatures.emplace_back( Feature( point ) );
142 }
143
144 if ( n < k )
145 {
146 feedback->reportError( QObject::tr( "Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
147 k = n;
148 }
149
150 if ( k > 1 )
151 {
152 feedback->pushInfo( QObject::tr( "Calculating clusters" ) );
153
154 // cluster centers
155 std::vector<QgsPointXY> centers( k );
156
157 initClusters( clusterFeatures, centers, k, feedback );
158 calculateKMeans( clusterFeatures, centers, k, feedback );
159 }
160
161 // cluster size
162 std::unordered_map<int, int> clusterSize;
163 for ( const int obj : idToObj )
164 clusterSize[clusterFeatures[obj].cluster]++;
165
166 features = source->getFeatures();
167 i = 0;
168 while ( features.nextFeature( feat ) )
169 {
170 i++;
171 if ( feedback->isCanceled() )
172 {
173 break;
174 }
175
176 feedback->setProgress( 50 + i * step );
177 QgsAttributes attr = feat.attributes();
178 const auto obj = idToObj.find( feat.id() );
179 if ( !feat.hasGeometry() || obj == idToObj.end() )
180 {
181 attr << QVariant() << QVariant();
182 }
183 else if ( k <= 1 )
184 {
185 attr << 0 << featureWithGeometryCount;
186 }
187 else
188 {
189 const int cluster = clusterFeatures[*obj].cluster;
190 attr << cluster << clusterSize[cluster];
191 }
192 feat.setAttributes( attr );
193 if ( !sink->addFeature( feat, QgsFeatureSink::FastInsert ) )
194 throw QgsProcessingException( writeFeatureError( sink.get(), parameters, QStringLiteral( "OUTPUT" ) ) );
195 }
196
197 sink->finalize();
198
199 QVariantMap outputs;
200 outputs.insert( QStringLiteral( "OUTPUT" ), dest );
201 return outputs;
202}
203
204// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
205
206void QgsKMeansClusteringAlgorithm::initClusters( std::vector<Feature> &points, std::vector<QgsPointXY> &centers, const int k, QgsProcessingFeedback *feedback )
207{
208 const std::size_t n = points.size();
209 if ( n == 0 )
210 return;
211
212 if ( n == 1 )
213 {
214 for ( int i = 0; i < k; i++ )
215 centers[i] = points[0].point;
216 return;
217 }
218
219 long duplicateCount = 1;
220 // initially scan for two most distance points from each other, p1 and p2
221 std::size_t p1 = 0;
222 std::size_t p2 = 0;
223 double distanceP1 = 0;
224 double distanceP2 = 0;
225 double maxDistance = -1;
226 for ( std::size_t i = 1; i < n; i++ )
227 {
228 distanceP1 = points[i].point.sqrDist( points[p1].point );
229 distanceP2 = points[i].point.sqrDist( points[p2].point );
230
231 // if this point is further then existing candidates, replace our choice
232 if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
233 {
234 maxDistance = std::max( distanceP1, distanceP2 );
235 if ( distanceP1 > distanceP2 )
236 p2 = i;
237 else
238 p1 = i;
239 }
240
241 // also record count of duplicate points
242 if ( qgsDoubleNear( distanceP1, 0 ) || qgsDoubleNear( distanceP2, 0 ) )
243 duplicateCount++;
244 }
245
246 if ( feedback && duplicateCount > 1 )
247 {
248 feedback->pushInfo( QObject::tr( "There are at least %n duplicate input(s), the number of output clusters may be less than was requested", nullptr, duplicateCount ) );
249 }
250
251 // By now two points should be found and be not the same
252 // Q_ASSERT( p1 != p2 && maxDistance >= 0 );
253
254 // Accept these two points as our initial centers
255 centers[0] = points[p1].point;
256 centers[1] = points[p2].point;
257
258 if ( k > 2 )
259 {
260 // array of minimum distance to a point from accepted cluster centers
261 std::vector<double> distances( n );
262
263 // initialize array with distance to first object
264 for ( std::size_t j = 0; j < n; j++ )
265 {
266 distances[j] = points[j].point.sqrDist( centers[0] );
267 }
268 distances[p1] = -1;
269 distances[p2] = -1;
270
271 // loop i on clusters, skip 0 and 1 as found already
272 for ( int i = 2; i < k; i++ )
273 {
274 std::size_t candidateCenter = 0;
275 double maxDistance = std::numeric_limits<double>::lowest();
276
277 // loop j on points
278 for ( std::size_t j = 0; j < n; j++ )
279 {
280 // accepted clusters are already marked with distance = -1
281 if ( distances[j] < 0 )
282 continue;
283
284 // update minimal distance with previously accepted cluster
285 distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
286
287 // greedily take a point that's farthest from any of accepted clusters
288 if ( distances[j] > maxDistance )
289 {
290 candidateCenter = j;
291 maxDistance = distances[j];
292 }
293 }
294
295 // checked earlier by counting entries on input, just in case
296 Q_ASSERT( maxDistance >= 0 );
297
298 // accept candidate to centers
299 distances[candidateCenter] = -1;
300 // copy the point coordinates into the initial centers array
301 centers[i] = points[candidateCenter].point;
302 }
303 }
304}
305
306// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
307
308void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> &centers, int k, QgsProcessingFeedback *feedback )
309{
310 int converged = false;
311 bool changed = false;
312
313 // avoid reallocating weights array for every iteration
314 std::vector<uint> weights( k );
315
316 uint i = 0;
317 for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
318 {
319 if ( feedback && feedback->isCanceled() )
320 break;
321
322 findNearest( objs, centers, k, changed );
323 updateMeans( objs, centers, weights, k );
324 converged = !changed;
325 }
326
327 if ( !converged && feedback )
328 feedback->reportError( QObject::tr( "Clustering did not converge after %n iteration(s)", nullptr, i ) );
329 else if ( feedback )
330 feedback->pushInfo( QObject::tr( "Clustering converged after %n iteration(s)", nullptr, i ) );
331}
332
333// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
334
335void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points, const std::vector<QgsPointXY> &centers, const int k, bool &changed )
336{
337 changed = false;
338 const std::size_t n = points.size();
339 for ( std::size_t i = 0; i < n; i++ )
340 {
341 Feature &point = points[i];
342
343 // Initialize with distance to first cluster
344 double currentDistance = point.point.sqrDist( centers[0] );
345 int currentCluster = 0;
346
347 // Check all other cluster centers and find the nearest
348 for ( int cluster = 1; cluster < k; cluster++ )
349 {
350 const double distance = point.point.sqrDist( centers[cluster] );
351 if ( distance < currentDistance )
352 {
353 currentDistance = distance;
354 currentCluster = cluster;
355 }
356 }
357
358 // Store the nearest cluster this object is in
359 if ( point.cluster != currentCluster )
360 {
361 changed = true;
362 point.cluster = currentCluster;
363 }
364 }
365}
366
367// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
368
369void QgsKMeansClusteringAlgorithm::updateMeans( const std::vector<Feature> &points, std::vector<QgsPointXY> &centers, std::vector<uint> &weights, const int k )
370{
371 const uint n = points.size();
372 std::fill( weights.begin(), weights.end(), 0 );
373 for ( int i = 0; i < k; i++ )
374 {
375 centers[i].setX( 0.0 );
376 centers[i].setY( 0.0 );
377 }
378 for ( uint i = 0; i < n; i++ )
379 {
380 const int cluster = points[i].cluster;
381 centers[cluster] += QgsVector( points[i].point.x(), points[i].point.y() );
382 weights[cluster] += 1;
383 }
384 for ( int i = 0; i < k; i++ )
385 {
386 centers[i] /= weights[i];
387 }
388}
389
390
@ VectorAnyGeometry
Any vector layer with geometry.
@ Advanced
Parameter is an advanced parameter which should be hidden from users by default.
A vector of attributes.
Wrapper for iterator of features from vector data provider or vector layer.
bool nextFeature(QgsFeature &f)
Fetch next feature and stores in f, returns true on success.
Wraps a request for features to a vector layer (or directly its vector data provider).
@ FastInsert
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
The feature class encapsulates a single feature including its unique ID, geometry and a list of field...
Definition qgsfeature.h:58
QgsAttributes attributes
Definition qgsfeature.h:67
QgsFeatureId id
Definition qgsfeature.h:66
void setAttributes(const QgsAttributes &attrs)
Sets the feature's attributes.
QgsGeometry geometry
Definition qgsfeature.h:69
bool hasGeometry() const
Returns true if the feature has an associated geometry.
bool isCanceled() const
Tells whether the operation has been canceled already.
Definition qgsfeedback.h:53
void setProgress(double progress)
Sets the current progress for the feedback object.
Definition qgsfeedback.h:61
Encapsulate a field in an attribute table or data source.
Definition qgsfield.h:53
Container of fields for a vector layer.
Definition qgsfields.h:46
bool append(const QgsField &field, Qgis::FieldOrigin origin=Qgis::FieldOrigin::Provider, int originIndex=-1)
Appends a field.
Definition qgsfields.cpp:70
A geometry is the spatial representation of a feature.
const QgsAbstractGeometry * constGet() const
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
QgsGeometry centroid() const
Returns the center of mass of a geometry.
Qgis::WkbType wkbType() const
Returns type of the geometry as a WKB type (point / linestring / polygon etc.)
Represents a 2D point.
Definition qgspointxy.h:60
Contains information about the context in which a processing algorithm is executed.
Custom exception class for processing related exceptions.
Base class for providing feedback from a processing algorithm.
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
A feature sink output for processing algorithms.
An input feature source (such as vector layers) parameter for processing algorithms.
A numeric parameter for processing algorithms.
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
Represent a 2-dimensional vector.
Definition qgsvector.h:30
static Qgis::WkbType flatType(Qgis::WkbType type)
Returns the flat type for a WKB type.
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference)
Definition qgis.h:6302