QGIS API Documentation  3.10.0-A Coruña (6c816b4204)
qgsalgorithmkmeansclustering.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  qgsalgorithmkmeansclustering.cpp
3  ---------------------
4  begin : June 2018
5  copyright : (C) 2018 by Nyall Dawson
6  email : nyall dot dawson at gmail dot com
7  ***************************************************************************/
8 
9 /***************************************************************************
10  * *
11  * This program is free software; you can redistribute it and/or modify *
12  * it under the terms of the GNU General Public License as published by *
13  * the Free Software Foundation; either version 2 of the License, or *
14  * (at your option) any later version. *
15  * *
16  ***************************************************************************/
17 
19 
21 
22 const int KMEANS_MAX_ITERATIONS = 1000;
23 
24 QString QgsKMeansClusteringAlgorithm::name() const
25 {
26  return QStringLiteral( "kmeansclustering" );
27 }
28 
29 QString QgsKMeansClusteringAlgorithm::displayName() const
30 {
31  return QObject::tr( "K-means clustering" );
32 }
33 
34 QStringList QgsKMeansClusteringAlgorithm::tags() const
35 {
36  return QObject::tr( "clustering,clusters,kmeans,points" ).split( ',' );
37 }
38 
39 QString QgsKMeansClusteringAlgorithm::group() const
40 {
41  return QObject::tr( "Vector analysis" );
42 }
43 
44 QString QgsKMeansClusteringAlgorithm::groupId() const
45 {
46  return QStringLiteral( "vectoranalysis" );
47 }
48 
49 void QgsKMeansClusteringAlgorithm::initAlgorithm( const QVariantMap & )
50 {
51  addParameter( new QgsProcessingParameterFeatureSource( QStringLiteral( "INPUT" ),
52  QObject::tr( "Input layer" ), QList< int >() << QgsProcessing::TypeVectorAnyGeometry ) );
53  addParameter( new QgsProcessingParameterNumber( QStringLiteral( "CLUSTERS" ), QObject::tr( "Number of clusters" ),
55  addParameter( new QgsProcessingParameterString( QStringLiteral( "FIELD_NAME" ),
56  QObject::tr( "Cluster field name" ), QStringLiteral( "CLUSTER_ID" ) ) );
57  addParameter( new QgsProcessingParameterFeatureSink( QStringLiteral( "OUTPUT" ), QObject::tr( "Clusters" ), QgsProcessing::TypeVectorAnyGeometry ) );
58 }
59 
60 QString QgsKMeansClusteringAlgorithm::shortHelpString() const
61 {
62  return QObject::tr( "Calculates the 2D distance based k-means cluster number for each input feature.\n\n"
63  "If input geometries are lines or polygons, the clustering is based on the centroid of the feature." );
64 }
65 
66 QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance() const
67 {
68  return new QgsKMeansClusteringAlgorithm();
69 }
70 
71 QVariantMap QgsKMeansClusteringAlgorithm::processAlgorithm( const QVariantMap &parameters, QgsProcessingContext &context, QgsProcessingFeedback *feedback )
72 {
73  std::unique_ptr< QgsProcessingFeatureSource > source( parameterAsSource( parameters, QStringLiteral( "INPUT" ), context ) );
74  if ( !source )
75  throw QgsProcessingException( invalidSourceError( parameters, QStringLiteral( "INPUT" ) ) );
76 
77  int k = parameterAsInt( parameters, QStringLiteral( "CLUSTERS" ), context );
78 
79  QgsFields outputFields = source->fields();
80  const QString clusterFieldName = parameterAsString( parameters, QStringLiteral( "FIELD_NAME" ), context );
81  QgsFields newFields;
82  newFields.append( QgsField( clusterFieldName, QVariant::Int ) );
83  outputFields = QgsProcessingUtils::combineFields( outputFields, newFields );
84 
85  QString dest;
86  std::unique_ptr< QgsFeatureSink > sink( parameterAsSink( parameters, QStringLiteral( "OUTPUT" ), context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
87  if ( !sink )
88  throw QgsProcessingException( invalidSinkError( parameters, QStringLiteral( "OUTPUT" ) ) );
89 
90  // build list of point inputs - if it's already a point, use that. If not, take the centroid.
91  feedback->pushInfo( QObject::tr( "Collecting input points" ) );
92  double step = source->featureCount() > 0 ? 50.0 / source->featureCount() : 1;
93  int i = 0;
94  int n = 0;
95  QgsFeature feat;
96 
97  std::vector< Feature > clusterFeatures;
98  QgsFeatureIterator features = source->getFeatures( QgsFeatureRequest().setNoAttributes() );
99  QHash< QgsFeatureId, int > idToObj;
100  while ( features.nextFeature( feat ) )
101  {
102  i++;
103  if ( feedback->isCanceled() )
104  {
105  break;
106  }
107 
108  feedback->setProgress( i * step );
109  if ( !feat.hasGeometry() )
110  continue;
111 
112  QgsPointXY point;
114  point = QgsPointXY( *qgsgeometry_cast< const QgsPoint * >( feat.geometry().constGet() ) );
115  else
116  {
117  QgsGeometry centroid = feat.geometry().centroid();
118  if ( centroid.isNull() )
119  continue; // centroid failed, e.g. empty linestring
120 
121  point = QgsPointXY( *qgsgeometry_cast< const QgsPoint * >( centroid.constGet() ) );
122  }
123 
124  n++;
125 
126  idToObj.insert( feat.id(), clusterFeatures.size() );
127  clusterFeatures.emplace_back( Feature( point ) );
128  }
129 
130  if ( n < k )
131  {
132  feedback->reportError( QObject::tr( "Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
133  k = n;
134  }
135 
136  if ( k > 1 )
137  {
138  feedback->pushInfo( QObject::tr( "Calculating clusters" ) );
139 
140  // cluster centers
141  std::vector< QgsPointXY > centers( k );
142 
143  initClusters( clusterFeatures, centers, k, feedback );
144  calculateKMeans( clusterFeatures, centers, k, feedback );
145  }
146 
147  features = source->getFeatures();
148  i = 0;
149  while ( features.nextFeature( feat ) )
150  {
151  i++;
152  if ( feedback->isCanceled() )
153  {
154  break;
155  }
156 
157  feedback->setProgress( 50 + i * step );
158  QgsAttributes attr = feat.attributes();
159  if ( !feat.hasGeometry() )
160  {
161  attr << QVariant();
162  }
163  else if ( k <= 1 )
164  {
165  attr << 0;
166  }
167  else if ( !idToObj.contains( feat.id() ) )
168  {
169  attr << QVariant();
170  }
171  else
172  {
173  attr << clusterFeatures[ idToObj.value( feat.id() ) ].cluster;
174  }
175  feat.setAttributes( attr );
176  sink->addFeature( feat, QgsFeatureSink::FastInsert );
177  }
178 
179  QVariantMap outputs;
180  outputs.insert( QStringLiteral( "OUTPUT" ), dest );
181  return outputs;
182 }
183 
184 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
185 
186 void QgsKMeansClusteringAlgorithm::initClusters( std::vector<Feature> &points, std::vector<QgsPointXY> &centers, const int k, QgsProcessingFeedback *feedback )
187 {
188  std::size_t n = points.size();
189  if ( n == 0 )
190  return;
191 
192  if ( n == 1 )
193  {
194  for ( int i = 0; i < k; i++ )
195  centers[ i ] = points[ 0 ].point;
196  return;
197  }
198 
199  long duplicateCount = 1;
200  // initially scan for two most distance points from each other, p1 and p2
201  std::size_t p1 = 0;
202  std::size_t p2 = 0;
203  double distanceP1 = 0;
204  double distanceP2 = 0;
205  double maxDistance = -1;
206  for ( std::size_t i = 1; i < n; i++ )
207  {
208  distanceP1 = points[i].point.sqrDist( points[p1].point );
209  distanceP2 = points[i].point.sqrDist( points[p2].point );
210 
211  // if this point is further then existing candidates, replace our choice
212  if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
213  {
214  maxDistance = std::max( distanceP1, distanceP2 );
215  if ( distanceP1 > distanceP2 )
216  p2 = i;
217  else
218  p1 = i;
219  }
220 
221  // also record count of duplicate points
222  if ( qgsDoubleNear( distanceP1, 0 ) || qgsDoubleNear( distanceP2, 0 ) )
223  duplicateCount++;
224  }
225 
226  if ( feedback && duplicateCount > 1 )
227  {
228  feedback->pushInfo( QObject::tr( "There are at least %1 duplicate inputs, the number of output clusters may be less than was requested" ).arg( duplicateCount ) );
229  }
230 
231  // By now two points should be found and be not the same
232  // Q_ASSERT( p1 != p2 && maxDistance >= 0 );
233 
234  // Accept these two points as our initial centers
235  centers[0] = points[p1].point;
236  centers[1] = points[p2].point;
237 
238  if ( k > 2 )
239  {
240  // array of minimum distance to a point from accepted cluster centers
241  std::vector< double > distances( n );
242 
243  // initialize array with distance to first object
244  for ( std::size_t j = 0; j < n; j++ )
245  {
246  distances[j] = points[j].point.sqrDist( centers[0] );
247  }
248  distances[p1] = -1;
249  distances[p2] = -1;
250 
251  // loop i on clusters, skip 0 and 1 as found already
252  for ( int i = 2; i < k; i++ )
253  {
254  std::size_t candidateCenter = 0;
255  double maxDistance = std::numeric_limits<double>::lowest();
256 
257  // loop j on points
258  for ( std::size_t j = 0; j < n; j++ )
259  {
260  // accepted clusters are already marked with distance = -1
261  if ( distances[j] < 0 )
262  continue;
263 
264  // update minimal distance with previously accepted cluster
265  distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
266 
267  // greedily take a point that's farthest from any of accepted clusters
268  if ( distances[j] > maxDistance )
269  {
270  candidateCenter = j;
271  maxDistance = distances[j];
272  }
273  }
274 
275  // checked earlier by counting entries on input, just in case
276  Q_ASSERT( maxDistance >= 0 );
277 
278  // accept candidate to centers
279  distances[candidateCenter] = -1;
280  // copy the point coordinates into the initial centers array
281  centers[i] = points[candidateCenter].point;
282  }
283  }
284 }
285 
286 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
287 
288 void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> &centers, int k, QgsProcessingFeedback *feedback )
289 {
290  int converged = false;
291  bool changed = false;
292 
293  // avoid reallocating weights array for every iteration
294  std::vector< uint > weights( k );
295 
296  uint i = 0;
297  for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
298  {
299  if ( feedback && feedback->isCanceled() )
300  break;
301 
302  findNearest( objs, centers, k, changed );
303  updateMeans( objs, centers, weights, k );
304  converged = !changed;
305  }
306 
307  if ( !converged && feedback )
308  feedback->reportError( QObject::tr( "Clustering did not converge after %1 iterations" ).arg( i ) );
309  else if ( feedback )
310  feedback->pushInfo( QObject::tr( "Clustering converged after %1 iterations" ).arg( i ) );
311 }
312 
313 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
314 
315 void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points, const std::vector<QgsPointXY> &centers, const int k, bool &changed )
316 {
317  changed = false;
318  std::size_t n = points.size();
319  for ( std::size_t i = 0; i < n; i++ )
320  {
321  Feature &point = points[i];
322 
323  // Initialize with distance to first cluster
324  double currentDistance = point.point.sqrDist( centers[0] );
325  int currentCluster = 0;
326 
327  // Check all other cluster centers and find the nearest
328  for ( int cluster = 1; cluster < k; cluster++ )
329  {
330  const double distance = point.point.sqrDist( centers[cluster] );
331  if ( distance < currentDistance )
332  {
333  currentDistance = distance;
334  currentCluster = cluster;
335  }
336  }
337 
338  // Store the nearest cluster this object is in
339  if ( point.cluster != currentCluster )
340  {
341  changed = true;
342  point.cluster = currentCluster;
343  }
344  }
345 }
346 
347 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
348 
349 void QgsKMeansClusteringAlgorithm::updateMeans( const std::vector<Feature> &points, std::vector<QgsPointXY> &centers, std::vector<uint> &weights, const int k )
350 {
351  uint n = points.size();
352  std::fill( weights.begin(), weights.end(), 0 );
353  for ( int i = 0; i < k; i++ )
354  {
355  centers[i].setX( 0.0 );
356  centers[i].setY( 0.0 );
357  }
358  for ( uint i = 0; i < n; i++ )
359  {
360  int cluster = points[i].cluster;
361  centers[cluster] += QgsVector( points[i].point.x(),
362  points[i].point.y() );
363  weights[cluster] += 1;
364  }
365  for ( int i = 0; i < k; i++ )
366  {
367  centers[i] /= weights[i];
368  }
369 }
370 
371 
373 
374 
QgsFeatureId id
Definition: qgsfeature.h:64
Wrapper for iterator of features from vector data provider or vector layer.
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
Base class for providing feedback from a processing algorithm.
QgsWkbTypes::Type wkbType() const
Returns type of the geometry as a WKB type (point / linestring / polygon etc.)
A class to represent a 2D point.
Definition: qgspointxy.h:43
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference)
Definition: qgis.h:280
void setProgress(double progress)
Sets the current progress for the feedback object.
Definition: qgsfeedback.h:63
Container of fields for a vector layer.
Definition: qgsfields.h:42
A geometry is the spatial representation of a feature.
Definition: qgsgeometry.h:122
void setAttributes(const QgsAttributes &attrs)
Sets the feature&#39;s attributes.
Definition: qgsfeature.cpp:127
QgsGeometry centroid() const
Returns the center of mass of a geometry.
The feature class encapsulates a single feature including its id, geometry and a list of field/values...
Definition: qgsfeature.h:55
bool hasGeometry() const
Returns true if the feature has an associated geometry.
Definition: qgsfeature.cpp:197
A feature sink output for processing algorithms.
double sqrDist(double x, double y) const
Returns the squared distance between this point a specified x, y coordinate.
Definition: qgspointxy.h:175
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
This class wraps a request for features to a vector layer (or directly its vector data provider)...
Custom exception class for processing related exceptions.
Definition: qgsexception.h:82
bool append(const QgsField &field, FieldOrigin origin=OriginProvider, int originIndex=-1)
Appends a field. The field must have unique name, otherwise it is rejected (returns false) ...
Definition: qgsfields.cpp:59
Encapsulate a field in an attribute table or data source.
Definition: qgsfield.h:48
A numeric parameter for processing algorithms.
A class to represent a vector.
Definition: qgsvector.h:29
const QgsAbstractGeometry * constGet() const
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
bool isCanceled() const
Tells whether the operation has been canceled already.
Definition: qgsfeedback.h:54
An input feature source (such as vector layers) parameter for processing algorithms.
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
QgsGeometry geometry
Definition: qgsfeature.h:67
bool nextFeature(QgsFeature &f)
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
A vector of attributes.
Definition: qgsattributes.h:57
static Type flatType(Type type)
Returns the flat type for a WKB type.
Definition: qgswkbtypes.h:576
Contains information about the context in which a processing algorithm is executed.
A string parameter for processing algorithms.
Any vector layer with geometry.
Definition: qgsprocessing.h:47
QgsAttributes attributes
Definition: qgsfeature.h:65