QGIS API Documentation  3.22.4-Białowieża (ce8e65e95e)
qgsalgorithmkmeansclustering.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  qgsalgorithmkmeansclustering.cpp
3  ---------------------
4  begin : June 2018
5  copyright : (C) 2018 by Nyall Dawson
6  email : nyall dot dawson at gmail dot com
7  ***************************************************************************/
8 
9 /***************************************************************************
10  * *
11  * This program is free software; you can redistribute it and/or modify *
12  * it under the terms of the GNU General Public License as published by *
13  * the Free Software Foundation; either version 2 of the License, or *
14  * (at your option) any later version. *
15  * *
16  ***************************************************************************/
17 
19 #include <unordered_map>
20 
22 
23 const int KMEANS_MAX_ITERATIONS = 1000;
24 
25 QString QgsKMeansClusteringAlgorithm::name() const
26 {
27  return QStringLiteral( "kmeansclustering" );
28 }
29 
30 QString QgsKMeansClusteringAlgorithm::displayName() const
31 {
32  return QObject::tr( "K-means clustering" );
33 }
34 
35 QStringList QgsKMeansClusteringAlgorithm::tags() const
36 {
37  return QObject::tr( "clustering,clusters,kmeans,points" ).split( ',' );
38 }
39 
40 QString QgsKMeansClusteringAlgorithm::group() const
41 {
42  return QObject::tr( "Vector analysis" );
43 }
44 
45 QString QgsKMeansClusteringAlgorithm::groupId() const
46 {
47  return QStringLiteral( "vectoranalysis" );
48 }
49 
50 void QgsKMeansClusteringAlgorithm::initAlgorithm( const QVariantMap & )
51 {
52  addParameter( new QgsProcessingParameterFeatureSource( QStringLiteral( "INPUT" ),
53  QObject::tr( "Input layer" ), QList< int >() << QgsProcessing::TypeVectorAnyGeometry ) );
54  addParameter( new QgsProcessingParameterNumber( QStringLiteral( "CLUSTERS" ), QObject::tr( "Number of clusters" ),
56 
57  auto fieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral( "FIELD_NAME" ),
58  QObject::tr( "Cluster field name" ), QStringLiteral( "CLUSTER_ID" ) );
59  fieldNameParam->setFlags( fieldNameParam->flags() | QgsProcessingParameterDefinition::FlagAdvanced );
60  addParameter( fieldNameParam.release() );
61  auto sizeFieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral( "SIZE_FIELD_NAME" ),
62  QObject::tr( "Cluster size field name" ), QStringLiteral( "CLUSTER_SIZE" ) );
63  sizeFieldNameParam->setFlags( sizeFieldNameParam->flags() | QgsProcessingParameterDefinition::FlagAdvanced );
64  addParameter( sizeFieldNameParam.release() );
65 
66  addParameter( new QgsProcessingParameterFeatureSink( QStringLiteral( "OUTPUT" ), QObject::tr( "Clusters" ), QgsProcessing::TypeVectorAnyGeometry ) );
67 }
68 
69 QString QgsKMeansClusteringAlgorithm::shortHelpString() const
70 {
71  return QObject::tr( "Calculates the 2D distance based k-means cluster number for each input feature.\n\n"
72  "If input geometries are lines or polygons, the clustering is based on the centroid of the feature." );
73 }
74 
75 QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance() const
76 {
77  return new QgsKMeansClusteringAlgorithm();
78 }
79 
80 QVariantMap QgsKMeansClusteringAlgorithm::processAlgorithm( const QVariantMap &parameters, QgsProcessingContext &context, QgsProcessingFeedback *feedback )
81 {
82  std::unique_ptr< QgsProcessingFeatureSource > source( parameterAsSource( parameters, QStringLiteral( "INPUT" ), context ) );
83  if ( !source )
84  throw QgsProcessingException( invalidSourceError( parameters, QStringLiteral( "INPUT" ) ) );
85 
86  int k = parameterAsInt( parameters, QStringLiteral( "CLUSTERS" ), context );
87 
88  QgsFields outputFields = source->fields();
89  QgsFields newFields;
90  const QString clusterFieldName = parameterAsString( parameters, QStringLiteral( "FIELD_NAME" ), context );
91  newFields.append( QgsField( clusterFieldName, QVariant::Int ) );
92  const QString clusterSizeFieldName = parameterAsString( parameters, QStringLiteral( "SIZE_FIELD_NAME" ), context );
93  newFields.append( QgsField( clusterSizeFieldName, QVariant::Int ) );
94  outputFields = QgsProcessingUtils::combineFields( outputFields, newFields );
95 
96  QString dest;
97  std::unique_ptr< QgsFeatureSink > sink( parameterAsSink( parameters, QStringLiteral( "OUTPUT" ), context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
98  if ( !sink )
99  throw QgsProcessingException( invalidSinkError( parameters, QStringLiteral( "OUTPUT" ) ) );
100 
101  // build list of point inputs - if it's already a point, use that. If not, take the centroid.
102  feedback->pushInfo( QObject::tr( "Collecting input points" ) );
103  const double step = source->featureCount() > 0 ? 50.0 / source->featureCount() : 1;
104  int i = 0;
105  int n = 0;
106  int featureWithGeometryCount = 0;
107  QgsFeature feat;
108 
109  std::vector< Feature > clusterFeatures;
110  QgsFeatureIterator features = source->getFeatures( QgsFeatureRequest().setNoAttributes() );
111  QHash< QgsFeatureId, int > idToObj;
112  while ( features.nextFeature( feat ) )
113  {
114  i++;
115  if ( feedback->isCanceled() )
116  {
117  break;
118  }
119 
120  feedback->setProgress( i * step );
121  if ( !feat.hasGeometry() )
122  continue;
123  featureWithGeometryCount++;
124 
125  QgsPointXY point;
127  point = QgsPointXY( *qgsgeometry_cast< const QgsPoint * >( feat.geometry().constGet() ) );
128  else
129  {
130  const QgsGeometry centroid = feat.geometry().centroid();
131  if ( centroid.isNull() )
132  continue; // centroid failed, e.g. empty linestring
133 
134  point = QgsPointXY( *qgsgeometry_cast< const QgsPoint * >( centroid.constGet() ) );
135  }
136 
137  n++;
138 
139  idToObj[ feat.id() ] = clusterFeatures.size();
140  clusterFeatures.emplace_back( Feature( point ) );
141  }
142 
143  if ( n < k )
144  {
145  feedback->reportError( QObject::tr( "Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
146  k = n;
147  }
148 
149  if ( k > 1 )
150  {
151  feedback->pushInfo( QObject::tr( "Calculating clusters" ) );
152 
153  // cluster centers
154  std::vector< QgsPointXY > centers( k );
155 
156  initClusters( clusterFeatures, centers, k, feedback );
157  calculateKMeans( clusterFeatures, centers, k, feedback );
158  }
159 
160  // cluster size
161  std::unordered_map< int, int> clusterSize;
162  for ( const int obj : idToObj )
163  clusterSize[ clusterFeatures[ obj ].cluster ]++;
164 
165  features = source->getFeatures();
166  i = 0;
167  while ( features.nextFeature( feat ) )
168  {
169  i++;
170  if ( feedback->isCanceled() )
171  {
172  break;
173  }
174 
175  feedback->setProgress( 50 + i * step );
176  QgsAttributes attr = feat.attributes();
177  const auto obj = idToObj.find( feat.id() );
178  if ( !feat.hasGeometry() || obj == idToObj.end() )
179  {
180  attr << QVariant() << QVariant();
181  }
182  else if ( k <= 1 )
183  {
184  attr << 0 << featureWithGeometryCount;
185  }
186  else
187  {
188  const int cluster = clusterFeatures[ *obj ].cluster;
189  attr << cluster << clusterSize[ cluster ];
190  }
191  feat.setAttributes( attr );
192  if ( !sink->addFeature( feat, QgsFeatureSink::FastInsert ) )
193  throw QgsProcessingException( writeFeatureError( sink.get(), parameters, QStringLiteral( "OUTPUT" ) ) );
194  }
195 
196  QVariantMap outputs;
197  outputs.insert( QStringLiteral( "OUTPUT" ), dest );
198  return outputs;
199 }
200 
201 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
202 
203 void QgsKMeansClusteringAlgorithm::initClusters( std::vector<Feature> &points, std::vector<QgsPointXY> &centers, const int k, QgsProcessingFeedback *feedback )
204 {
205  const std::size_t n = points.size();
206  if ( n == 0 )
207  return;
208 
209  if ( n == 1 )
210  {
211  for ( int i = 0; i < k; i++ )
212  centers[ i ] = points[ 0 ].point;
213  return;
214  }
215 
216  long duplicateCount = 1;
217  // initially scan for two most distance points from each other, p1 and p2
218  std::size_t p1 = 0;
219  std::size_t p2 = 0;
220  double distanceP1 = 0;
221  double distanceP2 = 0;
222  double maxDistance = -1;
223  for ( std::size_t i = 1; i < n; i++ )
224  {
225  distanceP1 = points[i].point.sqrDist( points[p1].point );
226  distanceP2 = points[i].point.sqrDist( points[p2].point );
227 
228  // if this point is further then existing candidates, replace our choice
229  if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
230  {
231  maxDistance = std::max( distanceP1, distanceP2 );
232  if ( distanceP1 > distanceP2 )
233  p2 = i;
234  else
235  p1 = i;
236  }
237 
238  // also record count of duplicate points
239  if ( qgsDoubleNear( distanceP1, 0 ) || qgsDoubleNear( distanceP2, 0 ) )
240  duplicateCount++;
241  }
242 
243  if ( feedback && duplicateCount > 1 )
244  {
245  feedback->pushInfo( QObject::tr( "There are at least %1 duplicate inputs, the number of output clusters may be less than was requested" ).arg( duplicateCount ) );
246  }
247 
248  // By now two points should be found and be not the same
249  // Q_ASSERT( p1 != p2 && maxDistance >= 0 );
250 
251  // Accept these two points as our initial centers
252  centers[0] = points[p1].point;
253  centers[1] = points[p2].point;
254 
255  if ( k > 2 )
256  {
257  // array of minimum distance to a point from accepted cluster centers
258  std::vector< double > distances( n );
259 
260  // initialize array with distance to first object
261  for ( std::size_t j = 0; j < n; j++ )
262  {
263  distances[j] = points[j].point.sqrDist( centers[0] );
264  }
265  distances[p1] = -1;
266  distances[p2] = -1;
267 
268  // loop i on clusters, skip 0 and 1 as found already
269  for ( int i = 2; i < k; i++ )
270  {
271  std::size_t candidateCenter = 0;
272  double maxDistance = std::numeric_limits<double>::lowest();
273 
274  // loop j on points
275  for ( std::size_t j = 0; j < n; j++ )
276  {
277  // accepted clusters are already marked with distance = -1
278  if ( distances[j] < 0 )
279  continue;
280 
281  // update minimal distance with previously accepted cluster
282  distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
283 
284  // greedily take a point that's farthest from any of accepted clusters
285  if ( distances[j] > maxDistance )
286  {
287  candidateCenter = j;
288  maxDistance = distances[j];
289  }
290  }
291 
292  // checked earlier by counting entries on input, just in case
293  Q_ASSERT( maxDistance >= 0 );
294 
295  // accept candidate to centers
296  distances[candidateCenter] = -1;
297  // copy the point coordinates into the initial centers array
298  centers[i] = points[candidateCenter].point;
299  }
300  }
301 }
302 
303 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
304 
305 void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> &centers, int k, QgsProcessingFeedback *feedback )
306 {
307  int converged = false;
308  bool changed = false;
309 
310  // avoid reallocating weights array for every iteration
311  std::vector< uint > weights( k );
312 
313  uint i = 0;
314  for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
315  {
316  if ( feedback && feedback->isCanceled() )
317  break;
318 
319  findNearest( objs, centers, k, changed );
320  updateMeans( objs, centers, weights, k );
321  converged = !changed;
322  }
323 
324  if ( !converged && feedback )
325  feedback->reportError( QObject::tr( "Clustering did not converge after %1 iterations" ).arg( i ) );
326  else if ( feedback )
327  feedback->pushInfo( QObject::tr( "Clustering converged after %1 iterations" ).arg( i ) );
328 }
329 
330 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
331 
332 void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points, const std::vector<QgsPointXY> &centers, const int k, bool &changed )
333 {
334  changed = false;
335  const std::size_t n = points.size();
336  for ( std::size_t i = 0; i < n; i++ )
337  {
338  Feature &point = points[i];
339 
340  // Initialize with distance to first cluster
341  double currentDistance = point.point.sqrDist( centers[0] );
342  int currentCluster = 0;
343 
344  // Check all other cluster centers and find the nearest
345  for ( int cluster = 1; cluster < k; cluster++ )
346  {
347  const double distance = point.point.sqrDist( centers[cluster] );
348  if ( distance < currentDistance )
349  {
350  currentDistance = distance;
351  currentCluster = cluster;
352  }
353  }
354 
355  // Store the nearest cluster this object is in
356  if ( point.cluster != currentCluster )
357  {
358  changed = true;
359  point.cluster = currentCluster;
360  }
361  }
362 }
363 
364 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
365 
366 void QgsKMeansClusteringAlgorithm::updateMeans( const std::vector<Feature> &points, std::vector<QgsPointXY> &centers, std::vector<uint> &weights, const int k )
367 {
368  const uint n = points.size();
369  std::fill( weights.begin(), weights.end(), 0 );
370  for ( int i = 0; i < k; i++ )
371  {
372  centers[i].setX( 0.0 );
373  centers[i].setY( 0.0 );
374  }
375  for ( uint i = 0; i < n; i++ )
376  {
377  const int cluster = points[i].cluster;
378  centers[cluster] += QgsVector( points[i].point.x(),
379  points[i].point.y() );
380  weights[cluster] += 1;
381  }
382  for ( int i = 0; i < k; i++ )
383  {
384  centers[i] /= weights[i];
385  }
386 }
387 
388 
390 
391 
A vector of attributes.
Definition: qgsattributes.h:58
Wrapper for iterator of features from vector data provider or vector layer.
bool nextFeature(QgsFeature &f)
This class wraps a request for features to a vector layer (or directly its vector data provider).
@ FastInsert
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
The feature class encapsulates a single feature including its unique ID, geometry and a list of field...
Definition: qgsfeature.h:56
QgsAttributes attributes
Definition: qgsfeature.h:65
void setAttributes(const QgsAttributes &attrs)
Sets the feature's attributes.
Definition: qgsfeature.cpp:153
QgsGeometry geometry
Definition: qgsfeature.h:67
bool hasGeometry() const
Returns true if the feature has an associated geometry.
Definition: qgsfeature.cpp:223
Q_GADGET QgsFeatureId id
Definition: qgsfeature.h:64
bool isCanceled() const SIP_HOLDGIL
Tells whether the operation has been canceled already.
Definition: qgsfeedback.h:54
void setProgress(double progress)
Sets the current progress for the feedback object.
Definition: qgsfeedback.h:63
Encapsulate a field in an attribute table or data source.
Definition: qgsfield.h:51
Container of fields for a vector layer.
Definition: qgsfields.h:45
bool append(const QgsField &field, FieldOrigin origin=OriginProvider, int originIndex=-1)
Appends a field. The field must have unique name, otherwise it is rejected (returns false)
Definition: qgsfields.cpp:59
A geometry is the spatial representation of a feature.
Definition: qgsgeometry.h:125
const QgsAbstractGeometry * constGet() const SIP_HOLDGIL
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
QgsWkbTypes::Type wkbType() const SIP_HOLDGIL
Returns type of the geometry as a WKB type (point / linestring / polygon etc.)
QgsGeometry centroid() const
Returns the center of mass of a geometry.
A class to represent a 2D point.
Definition: qgspointxy.h:59
Contains information about the context in which a processing algorithm is executed.
Custom exception class for processing related exceptions.
Definition: qgsexception.h:83
Base class for providing feedback from a processing algorithm.
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
@ FlagAdvanced
Parameter is an advanced parameter which should be hidden from users by default.
A feature sink output for processing algorithms.
An input feature source (such as vector layers) parameter for processing algorithms.
A numeric parameter for processing algorithms.
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
@ TypeVectorAnyGeometry
Any vector layer with geometry.
Definition: qgsprocessing.h:48
A class to represent a vector.
Definition: qgsvector.h:30
static Type flatType(Type type) SIP_HOLDGIL
Returns the flat type for a WKB type.
Definition: qgswkbtypes.h:732
CORE_EXPORT QgsMeshVertex centroid(const QgsMeshFace &face, const QVector< QgsMeshVertex > &vertices)
Returns the centroid of the face.
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference)
Definition: qgis.h:1246