QGIS API Documentation  3.16.0-Hannover (43b64b13f3)
qgsalgorithmkmeansclustering.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  qgsalgorithmkmeansclustering.cpp
3  ---------------------
4  begin : June 2018
5  copyright : (C) 2018 by Nyall Dawson
6  email : nyall dot dawson at gmail dot com
7  ***************************************************************************/
8 
9 /***************************************************************************
10  * *
11  * This program is free software; you can redistribute it and/or modify *
12  * it under the terms of the GNU General Public License as published by *
13  * the Free Software Foundation; either version 2 of the License, or *
14  * (at your option) any later version. *
15  * *
16  ***************************************************************************/
17 
19 #include <unordered_map>
20 
22 
23 const int KMEANS_MAX_ITERATIONS = 1000;
24 
25 QString QgsKMeansClusteringAlgorithm::name() const
26 {
27  return QStringLiteral( "kmeansclustering" );
28 }
29 
30 QString QgsKMeansClusteringAlgorithm::displayName() const
31 {
32  return QObject::tr( "K-means clustering" );
33 }
34 
35 QStringList QgsKMeansClusteringAlgorithm::tags() const
36 {
37  return QObject::tr( "clustering,clusters,kmeans,points" ).split( ',' );
38 }
39 
40 QString QgsKMeansClusteringAlgorithm::group() const
41 {
42  return QObject::tr( "Vector analysis" );
43 }
44 
45 QString QgsKMeansClusteringAlgorithm::groupId() const
46 {
47  return QStringLiteral( "vectoranalysis" );
48 }
49 
50 void QgsKMeansClusteringAlgorithm::initAlgorithm( const QVariantMap & )
51 {
52  addParameter( new QgsProcessingParameterFeatureSource( QStringLiteral( "INPUT" ),
53  QObject::tr( "Input layer" ), QList< int >() << QgsProcessing::TypeVectorAnyGeometry ) );
54  addParameter( new QgsProcessingParameterNumber( QStringLiteral( "CLUSTERS" ), QObject::tr( "Number of clusters" ),
56 
57  auto fieldNameParam = qgis::make_unique<QgsProcessingParameterString>( QStringLiteral( "FIELD_NAME" ),
58  QObject::tr( "Cluster field name" ), QStringLiteral( "CLUSTER_ID" ) );
59  fieldNameParam->setFlags( fieldNameParam->flags() | QgsProcessingParameterDefinition::FlagAdvanced );
60  addParameter( fieldNameParam.release() );
61  auto sizeFieldNameParam = qgis::make_unique<QgsProcessingParameterString>( QStringLiteral( "SIZE_FIELD_NAME" ),
62  QObject::tr( "Cluster size field name" ), QStringLiteral( "CLUSTER_SIZE" ) );
63  sizeFieldNameParam->setFlags( sizeFieldNameParam->flags() | QgsProcessingParameterDefinition::FlagAdvanced );
64  addParameter( sizeFieldNameParam.release() );
65 
66  addParameter( new QgsProcessingParameterFeatureSink( QStringLiteral( "OUTPUT" ), QObject::tr( "Clusters" ), QgsProcessing::TypeVectorAnyGeometry ) );
67 }
68 
69 QString QgsKMeansClusteringAlgorithm::shortHelpString() const
70 {
71  return QObject::tr( "Calculates the 2D distance based k-means cluster number for each input feature.\n\n"
72  "If input geometries are lines or polygons, the clustering is based on the centroid of the feature." );
73 }
74 
75 QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance() const
76 {
77  return new QgsKMeansClusteringAlgorithm();
78 }
79 
80 QVariantMap QgsKMeansClusteringAlgorithm::processAlgorithm( const QVariantMap &parameters, QgsProcessingContext &context, QgsProcessingFeedback *feedback )
81 {
82  std::unique_ptr< QgsProcessingFeatureSource > source( parameterAsSource( parameters, QStringLiteral( "INPUT" ), context ) );
83  if ( !source )
84  throw QgsProcessingException( invalidSourceError( parameters, QStringLiteral( "INPUT" ) ) );
85 
86  int k = parameterAsInt( parameters, QStringLiteral( "CLUSTERS" ), context );
87 
88  QgsFields outputFields = source->fields();
89  QgsFields newFields;
90  const QString clusterFieldName = parameterAsString( parameters, QStringLiteral( "FIELD_NAME" ), context );
91  newFields.append( QgsField( clusterFieldName, QVariant::Int ) );
92  const QString clusterSizeFieldName = parameterAsString( parameters, QStringLiteral( "SIZE_FIELD_NAME" ), context );
93  newFields.append( QgsField( clusterSizeFieldName, QVariant::Int ) );
94  outputFields = QgsProcessingUtils::combineFields( outputFields, newFields );
95 
96  QString dest;
97  std::unique_ptr< QgsFeatureSink > sink( parameterAsSink( parameters, QStringLiteral( "OUTPUT" ), context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
98  if ( !sink )
99  throw QgsProcessingException( invalidSinkError( parameters, QStringLiteral( "OUTPUT" ) ) );
100 
101  // build list of point inputs - if it's already a point, use that. If not, take the centroid.
102  feedback->pushInfo( QObject::tr( "Collecting input points" ) );
103  double step = source->featureCount() > 0 ? 50.0 / source->featureCount() : 1;
104  int i = 0;
105  int n = 0;
106  int featureWithGeometryCount = 0;
107  QgsFeature feat;
108 
109  std::vector< Feature > clusterFeatures;
110  QgsFeatureIterator features = source->getFeatures( QgsFeatureRequest().setNoAttributes() );
111  QHash< QgsFeatureId, int > idToObj;
112  while ( features.nextFeature( feat ) )
113  {
114  i++;
115  if ( feedback->isCanceled() )
116  {
117  break;
118  }
119 
120  feedback->setProgress( i * step );
121  if ( !feat.hasGeometry() )
122  continue;
123  featureWithGeometryCount++;
124 
125  QgsPointXY point;
127  point = QgsPointXY( *qgsgeometry_cast< const QgsPoint * >( feat.geometry().constGet() ) );
128  else
129  {
130  QgsGeometry centroid = feat.geometry().centroid();
131  if ( centroid.isNull() )
132  continue; // centroid failed, e.g. empty linestring
133 
134  point = QgsPointXY( *qgsgeometry_cast< const QgsPoint * >( centroid.constGet() ) );
135  }
136 
137  n++;
138 
139  idToObj[ feat.id() ] = clusterFeatures.size();
140  clusterFeatures.emplace_back( Feature( point ) );
141  }
142 
143  if ( n < k )
144  {
145  feedback->reportError( QObject::tr( "Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
146  k = n;
147  }
148 
149  if ( k > 1 )
150  {
151  feedback->pushInfo( QObject::tr( "Calculating clusters" ) );
152 
153  // cluster centers
154  std::vector< QgsPointXY > centers( k );
155 
156  initClusters( clusterFeatures, centers, k, feedback );
157  calculateKMeans( clusterFeatures, centers, k, feedback );
158  }
159 
160  // cluster size
161  std::unordered_map< int, int> clusterSize;
162  for ( int obj : idToObj )
163  clusterSize[ clusterFeatures[ obj ].cluster ]++;
164 
165  features = source->getFeatures();
166  i = 0;
167  while ( features.nextFeature( feat ) )
168  {
169  i++;
170  if ( feedback->isCanceled() )
171  {
172  break;
173  }
174 
175  feedback->setProgress( 50 + i * step );
176  QgsAttributes attr = feat.attributes();
177  auto obj = idToObj.find( feat.id() );
178  if ( !feat.hasGeometry() || obj == idToObj.end() )
179  {
180  attr << QVariant() << QVariant();
181  }
182  else if ( k <= 1 )
183  {
184  attr << 0 << featureWithGeometryCount;
185  }
186  else
187  {
188  int cluster = clusterFeatures[ *obj ].cluster;
189  attr << cluster << clusterSize[ cluster ];
190  }
191  feat.setAttributes( attr );
192  sink->addFeature( feat, QgsFeatureSink::FastInsert );
193  }
194 
195  QVariantMap outputs;
196  outputs.insert( QStringLiteral( "OUTPUT" ), dest );
197  return outputs;
198 }
199 
200 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
201 
202 void QgsKMeansClusteringAlgorithm::initClusters( std::vector<Feature> &points, std::vector<QgsPointXY> &centers, const int k, QgsProcessingFeedback *feedback )
203 {
204  std::size_t n = points.size();
205  if ( n == 0 )
206  return;
207 
208  if ( n == 1 )
209  {
210  for ( int i = 0; i < k; i++ )
211  centers[ i ] = points[ 0 ].point;
212  return;
213  }
214 
215  long duplicateCount = 1;
216  // initially scan for two most distance points from each other, p1 and p2
217  std::size_t p1 = 0;
218  std::size_t p2 = 0;
219  double distanceP1 = 0;
220  double distanceP2 = 0;
221  double maxDistance = -1;
222  for ( std::size_t i = 1; i < n; i++ )
223  {
224  distanceP1 = points[i].point.sqrDist( points[p1].point );
225  distanceP2 = points[i].point.sqrDist( points[p2].point );
226 
227  // if this point is further then existing candidates, replace our choice
228  if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
229  {
230  maxDistance = std::max( distanceP1, distanceP2 );
231  if ( distanceP1 > distanceP2 )
232  p2 = i;
233  else
234  p1 = i;
235  }
236 
237  // also record count of duplicate points
238  if ( qgsDoubleNear( distanceP1, 0 ) || qgsDoubleNear( distanceP2, 0 ) )
239  duplicateCount++;
240  }
241 
242  if ( feedback && duplicateCount > 1 )
243  {
244  feedback->pushInfo( QObject::tr( "There are at least %1 duplicate inputs, the number of output clusters may be less than was requested" ).arg( duplicateCount ) );
245  }
246 
247  // By now two points should be found and be not the same
248  // Q_ASSERT( p1 != p2 && maxDistance >= 0 );
249 
250  // Accept these two points as our initial centers
251  centers[0] = points[p1].point;
252  centers[1] = points[p2].point;
253 
254  if ( k > 2 )
255  {
256  // array of minimum distance to a point from accepted cluster centers
257  std::vector< double > distances( n );
258 
259  // initialize array with distance to first object
260  for ( std::size_t j = 0; j < n; j++ )
261  {
262  distances[j] = points[j].point.sqrDist( centers[0] );
263  }
264  distances[p1] = -1;
265  distances[p2] = -1;
266 
267  // loop i on clusters, skip 0 and 1 as found already
268  for ( int i = 2; i < k; i++ )
269  {
270  std::size_t candidateCenter = 0;
271  double maxDistance = std::numeric_limits<double>::lowest();
272 
273  // loop j on points
274  for ( std::size_t j = 0; j < n; j++ )
275  {
276  // accepted clusters are already marked with distance = -1
277  if ( distances[j] < 0 )
278  continue;
279 
280  // update minimal distance with previously accepted cluster
281  distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
282 
283  // greedily take a point that's farthest from any of accepted clusters
284  if ( distances[j] > maxDistance )
285  {
286  candidateCenter = j;
287  maxDistance = distances[j];
288  }
289  }
290 
291  // checked earlier by counting entries on input, just in case
292  Q_ASSERT( maxDistance >= 0 );
293 
294  // accept candidate to centers
295  distances[candidateCenter] = -1;
296  // copy the point coordinates into the initial centers array
297  centers[i] = points[candidateCenter].point;
298  }
299  }
300 }
301 
302 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
303 
304 void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> &centers, int k, QgsProcessingFeedback *feedback )
305 {
306  int converged = false;
307  bool changed = false;
308 
309  // avoid reallocating weights array for every iteration
310  std::vector< uint > weights( k );
311 
312  uint i = 0;
313  for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
314  {
315  if ( feedback && feedback->isCanceled() )
316  break;
317 
318  findNearest( objs, centers, k, changed );
319  updateMeans( objs, centers, weights, k );
320  converged = !changed;
321  }
322 
323  if ( !converged && feedback )
324  feedback->reportError( QObject::tr( "Clustering did not converge after %1 iterations" ).arg( i ) );
325  else if ( feedback )
326  feedback->pushInfo( QObject::tr( "Clustering converged after %1 iterations" ).arg( i ) );
327 }
328 
329 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
330 
331 void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points, const std::vector<QgsPointXY> &centers, const int k, bool &changed )
332 {
333  changed = false;
334  std::size_t n = points.size();
335  for ( std::size_t i = 0; i < n; i++ )
336  {
337  Feature &point = points[i];
338 
339  // Initialize with distance to first cluster
340  double currentDistance = point.point.sqrDist( centers[0] );
341  int currentCluster = 0;
342 
343  // Check all other cluster centers and find the nearest
344  for ( int cluster = 1; cluster < k; cluster++ )
345  {
346  const double distance = point.point.sqrDist( centers[cluster] );
347  if ( distance < currentDistance )
348  {
349  currentDistance = distance;
350  currentCluster = cluster;
351  }
352  }
353 
354  // Store the nearest cluster this object is in
355  if ( point.cluster != currentCluster )
356  {
357  changed = true;
358  point.cluster = currentCluster;
359  }
360  }
361 }
362 
363 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
364 
365 void QgsKMeansClusteringAlgorithm::updateMeans( const std::vector<Feature> &points, std::vector<QgsPointXY> &centers, std::vector<uint> &weights, const int k )
366 {
367  uint n = points.size();
368  std::fill( weights.begin(), weights.end(), 0 );
369  for ( int i = 0; i < k; i++ )
370  {
371  centers[i].setX( 0.0 );
372  centers[i].setY( 0.0 );
373  }
374  for ( uint i = 0; i < n; i++ )
375  {
376  int cluster = points[i].cluster;
377  centers[cluster] += QgsVector( points[i].point.x(),
378  points[i].point.y() );
379  weights[cluster] += 1;
380  }
381  for ( int i = 0; i < k; i++ )
382  {
383  centers[i] /= weights[i];
384  }
385 }
386 
387 
389 
390 
QgsFeedback::setProgress
void setProgress(double progress)
Sets the current progress for the feedback object.
Definition: qgsfeedback.h:62
QgsFeature::id
Q_GADGET QgsFeatureId id
Definition: qgsfeature.h:64
QgsWkbTypes::Point
@ Point
Definition: qgswkbtypes.h:72
QgsProcessingParameterNumber
A numeric parameter for processing algorithms.
Definition: qgsprocessingparameters.h:1960
QgsProcessingFeedback
Base class for providing feedback from a processing algorithm.
Definition: qgsprocessingfeedback.h:38
QgsGeometry::isNull
Q_GADGET bool isNull
Definition: qgsgeometry.h:126
QgsProcessingFeedback::pushInfo
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
Definition: qgsprocessingfeedback.cpp:48
QgsWkbTypes::flatType
static Type flatType(Type type) SIP_HOLDGIL
Returns the flat type for a WKB type.
Definition: qgswkbtypes.h:702
QgsProcessingFeedback::reportError
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
Definition: qgsprocessingfeedback.cpp:39
QgsFields
Container of fields for a vector layer.
Definition: qgsfields.h:45
QgsGeometry::centroid
QgsGeometry centroid() const
Returns the center of mass of a geometry.
Definition: qgsgeometry.cpp:2138
QgsFeature::geometry
QgsGeometry geometry
Definition: qgsfeature.h:67
QgsProcessingParameterDefinition::FlagAdvanced
@ FlagAdvanced
Parameter is an advanced parameter which should be hidden from users by default.
Definition: qgsprocessingparameters.h:423
QgsProcessingParameterFeatureSource
An input feature source (such as vector layers) parameter for processing algorithms.
Definition: qgsprocessingparameters.h:2734
QgsFields::append
bool append(const QgsField &field, FieldOrigin origin=OriginProvider, int originIndex=-1)
Appends a field. The field must have unique name, otherwise it is rejected (returns false)
Definition: qgsfields.cpp:59
qgsalgorithmkmeansclustering.h
QgsProcessingUtils::combineFields
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
Definition: qgsprocessingutils.cpp:1127
QgsProcessingParameterFeatureSink
A feature sink output for processing algorithms.
Definition: qgsprocessingparameters.h:2895
QgsFeatureRequest
This class wraps a request for features to a vector layer (or directly its vector data provider).
Definition: qgsfeaturerequest.h:76
QgsProcessingContext
Contains information about the context in which a processing algorithm is executed.
Definition: qgsprocessingcontext.h:44
qgsDoubleNear
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference)
Definition: qgis.h:315
QgsProcessing::TypeVectorAnyGeometry
@ TypeVectorAnyGeometry
Any vector layer with geometry.
Definition: qgsprocessing.h:47
QgsGeometry::constGet
const QgsAbstractGeometry * constGet() const SIP_HOLDGIL
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
Definition: qgsgeometry.cpp:128
QgsFeature::attributes
QgsAttributes attributes
Definition: qgsfeature.h:65
QgsProcessingParameterNumber::Integer
@ Integer
Integer values.
Definition: qgsprocessingparameters.h:1966
QgsPointXY
A class to represent a 2D point.
Definition: qgspointxy.h:44
QgsFeedback::isCanceled
bool isCanceled() const
Tells whether the operation has been canceled already.
Definition: qgsfeedback.h:53
QgsFeatureIterator::nextFeature
bool nextFeature(QgsFeature &f)
Definition: qgsfeatureiterator.h:374
QgsGeometry
A geometry is the spatial representation of a feature.
Definition: qgsgeometry.h:124
QgsFeature::hasGeometry
bool hasGeometry() const
Returns true if the feature has an associated geometry.
Definition: qgsfeature.cpp:199
QgsVector
A class to represent a vector.
Definition: qgsvector.h:30
QgsAttributes
A vector of attributes.
Definition: qgsattributes.h:58
QgsFeature
The feature class encapsulates a single feature including its id, geometry and a list of field/values...
Definition: qgsfeature.h:56
QgsFeature::setAttributes
void setAttributes(const QgsAttributes &attrs)
Sets the feature's attributes.
Definition: qgsfeature.cpp:129
QgsFeatureIterator
Wrapper for iterator of features from vector data provider or vector layer.
Definition: qgsfeatureiterator.h:265
QgsProcessingException
Custom exception class for processing related exceptions.
Definition: qgsexception.h:83
QgsGeometry::wkbType
QgsWkbTypes::Type wkbType() const SIP_HOLDGIL
Returns type of the geometry as a WKB type (point / linestring / polygon etc.)
Definition: qgsgeometry.cpp:345
QgsFeatureSink::FastInsert
@ FastInsert
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
Definition: qgsfeaturesink.h:70
QgsField
Encapsulate a field in an attribute table or data source.
Definition: qgsfield.h:50