QGIS API Documentation  3.4.15-Madeira (e83d02e274)
qgsalgorithmkmeansclustering.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  qgsalgorithmkmeansclustering.cpp
3  ---------------------
4  begin : June 2018
5  copyright : (C) 2018 by Nyall Dawson
6  email : nyall dot dawson at gmail dot com
7  ***************************************************************************/
8 
9 /***************************************************************************
10  * *
11  * This program is free software; you can redistribute it and/or modify *
12  * it under the terms of the GNU General Public License as published by *
13  * the Free Software Foundation; either version 2 of the License, or *
14  * (at your option) any later version. *
15  * *
16  ***************************************************************************/
17 
19 
21 
22 const int KMEANS_MAX_ITERATIONS = 1000;
23 
24 QString QgsKMeansClusteringAlgorithm::name() const
25 {
26  return QStringLiteral( "kmeansclustering" );
27 }
28 
29 QString QgsKMeansClusteringAlgorithm::displayName() const
30 {
31  return QObject::tr( "K-means clustering" );
32 }
33 
34 QStringList QgsKMeansClusteringAlgorithm::tags() const
35 {
36  return QObject::tr( "clustering,clusters,kmeans,points" ).split( ',' );
37 }
38 
39 QString QgsKMeansClusteringAlgorithm::group() const
40 {
41  return QObject::tr( "Vector analysis" );
42 }
43 
44 QString QgsKMeansClusteringAlgorithm::groupId() const
45 {
46  return QStringLiteral( "vectoranalysis" );
47 }
48 
49 void QgsKMeansClusteringAlgorithm::initAlgorithm( const QVariantMap & )
50 {
51  addParameter( new QgsProcessingParameterFeatureSource( QStringLiteral( "INPUT" ),
52  QObject::tr( "Input layer" ), QList< int >() << QgsProcessing::TypeVectorAnyGeometry ) );
53  addParameter( new QgsProcessingParameterNumber( QStringLiteral( "CLUSTERS" ), QObject::tr( "Number of clusters" ),
55  addParameter( new QgsProcessingParameterString( QStringLiteral( "FIELD_NAME" ),
56  QObject::tr( "Cluster field name" ), QStringLiteral( "CLUSTER_ID" ) ) );
57  addParameter( new QgsProcessingParameterFeatureSink( QStringLiteral( "OUTPUT" ), QObject::tr( "Clusters" ), QgsProcessing::TypeVectorAnyGeometry ) );
58 }
59 
60 QString QgsKMeansClusteringAlgorithm::shortHelpString() const
61 {
62  return QObject::tr( "Calculates the 2D distance based k-means cluster number for each input feature.\n\n"
63  "If input geometries are lines or polygons, the clustering is based on the centroid of the feature." );
64 }
65 
66 QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance() const
67 {
68  return new QgsKMeansClusteringAlgorithm();
69 }
70 
71 QVariantMap QgsKMeansClusteringAlgorithm::processAlgorithm( const QVariantMap &parameters, QgsProcessingContext &context, QgsProcessingFeedback *feedback )
72 {
73  std::unique_ptr< QgsProcessingFeatureSource > source( parameterAsSource( parameters, QStringLiteral( "INPUT" ), context ) );
74  if ( !source )
75  throw QgsProcessingException( invalidSourceError( parameters, QStringLiteral( "INPUT" ) ) );
76 
77  int k = parameterAsInt( parameters, QStringLiteral( "CLUSTERS" ), context );
78 
79  QgsFields outputFields = source->fields();
80  const QString clusterFieldName = parameterAsString( parameters, QStringLiteral( "FIELD_NAME" ), context );
81  QgsFields newFields;
82  newFields.append( QgsField( clusterFieldName, QVariant::Int ) );
83  outputFields = QgsProcessingUtils::combineFields( outputFields, newFields );
84 
85  QString dest;
86  std::unique_ptr< QgsFeatureSink > sink( parameterAsSink( parameters, QStringLiteral( "OUTPUT" ), context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
87  if ( !sink )
88  throw QgsProcessingException( invalidSinkError( parameters, QStringLiteral( "OUTPUT" ) ) );
89 
90  // build list of point inputs - if it's already a point, use that. If not, take the centroid.
91  feedback->pushInfo( QObject::tr( "Collecting input points" ) );
92  double step = source->featureCount() > 0 ? 50.0 / source->featureCount() : 1;
93  int i = 0;
94  int n = 0;
95  QgsFeature feat;
96 
97  std::vector< Feature > clusterFeatures;
98  QgsFeatureIterator features = source->getFeatures( QgsFeatureRequest().setNoAttributes() );
99  QHash< QgsFeatureId, int > idToObj;
100  while ( features.nextFeature( feat ) )
101  {
102  i++;
103  if ( feedback->isCanceled() )
104  {
105  break;
106  }
107 
108  feedback->setProgress( i * step );
109  if ( !feat.hasGeometry() )
110  continue;
111 
112  QgsPointXY point;
114  point = QgsPointXY( *qgsgeometry_cast< const QgsPoint * >( feat.geometry().constGet() ) );
115  else
116  {
117  QgsGeometry centroid = feat.geometry().centroid();
118  if ( centroid.isNull() )
119  continue; // centroid failed, e.g. empty linestring
120 
121  point = QgsPointXY( *qgsgeometry_cast< const QgsPoint * >( centroid.constGet() ) );
122  }
123 
124  n++;
125 
126  idToObj.insert( feat.id(), clusterFeatures.size() );
127  clusterFeatures.emplace_back( Feature( point ) );
128  }
129 
130  if ( n < k )
131  {
132  feedback->reportError( QObject::tr( "Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
133  k = n;
134  }
135 
136  if ( k > 1 )
137  {
138  feedback->pushInfo( QObject::tr( "Calculating clusters" ) );
139 
140  // cluster centers
141  std::vector< QgsPointXY > centers( k );
142 
143  initClusters( clusterFeatures, centers, k, feedback );
144  calculateKMeans( clusterFeatures, centers, k, feedback );
145  }
146 
147  features = source->getFeatures();
148  i = 0;
149  while ( features.nextFeature( feat ) )
150  {
151  i++;
152  if ( feedback->isCanceled() )
153  {
154  break;
155  }
156 
157  feedback->setProgress( 50 + i * step );
158  QgsAttributes attr = feat.attributes();
159  if ( !feat.hasGeometry() )
160  {
161  attr << QVariant();
162  }
163  else if ( k <= 1 )
164  {
165  attr << 0;
166  }
167  else if ( !idToObj.contains( feat.id() ) )
168  {
169  attr << QVariant();
170  }
171  else
172  {
173  attr << clusterFeatures[ idToObj.value( feat.id() ) ].cluster;
174  }
175  feat.setAttributes( attr );
176  sink->addFeature( feat, QgsFeatureSink::FastInsert );
177  }
178 
179  QVariantMap outputs;
180  outputs.insert( QStringLiteral( "OUTPUT" ), dest );
181  return outputs;
182 }
183 
184 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
185 
186 void QgsKMeansClusteringAlgorithm::initClusters( std::vector<Feature> &points, std::vector<QgsPointXY> &centers, const int k, QgsProcessingFeedback *feedback )
187 {
188  std::size_t n = points.size();
189  if ( n == 0 )
190  return;
191 
192  if ( n == 1 )
193  {
194  for ( int i = 0; i < k; i++ )
195  centers[ i ] = points[ 0 ].point;
196  return;
197  }
198 
199  long duplicateCount = 1;
200  // initially scan for two most distance points from each other, p1 and p2
201  std::size_t p1 = 0;
202  std::size_t p2 = 0;
203  double distanceP1 = 0;
204  double distanceP2 = 0;
205  double maxDistance = -1;
206  for ( std::size_t i = 1; i < n; i++ )
207  {
208  distanceP1 = points[i].point.sqrDist( points[p1].point );
209  distanceP2 = points[i].point.sqrDist( points[p2].point );
210 
211  // if this point is further then existing candidates, replace our choice
212  if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
213  {
214  maxDistance = std::max( distanceP1, distanceP2 );
215  if ( distanceP1 > distanceP2 )
216  p2 = i;
217  else
218  p1 = i;
219  }
220 
221  // also record count of duplicate points
222  if ( qgsDoubleNear( distanceP1, 0 ) || qgsDoubleNear( distanceP2, 0 ) )
223  duplicateCount++;
224  }
225 
226  if ( feedback && duplicateCount > 1 )
227  {
228  feedback->pushInfo( QObject::tr( "There are at least %1 duplicate inputs, the number of output clusters may be less than was requested" ).arg( duplicateCount ) );
229  }
230 
231  // By now two points should be found and be not the same
232  // Q_ASSERT( p1 != p2 && maxDistance >= 0 );
233 
234  // Accept these two points as our initial centers
235  centers[0] = points[p1].point;
236  centers[1] = points[p2].point;
237 
238  if ( k > 2 )
239  {
240  // array of minimum distance to a point from accepted cluster centers
241  std::vector< double > distances( n );
242 
243  // initialize array with distance to first object
244  for ( std::size_t j = 0; j < n; j++ )
245  {
246  distances[j] = points[j].point.sqrDist( centers[0] );
247  }
248  distances[p1] = -1;
249  distances[p2] = -1;
250 
251  // loop i on clusters, skip 0 and 1 as found already
252  for ( int i = 2; i < k; i++ )
253  {
254  std::size_t candidateCenter = 0;
255  double maxDistance = std::numeric_limits<double>::lowest();
256 
257  // loop j on points
258  for ( std::size_t j = 0; j < n; j++ )
259  {
260  // accepted clusters are already marked with distance = -1
261  if ( distances[j] < 0 )
262  continue;
263 
264  // update minimal distance with previously accepted cluster
265  distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
266 
267  // greedily take a point that's farthest from any of accepted clusters
268  if ( distances[j] > maxDistance )
269  {
270  candidateCenter = j;
271  maxDistance = distances[j];
272  }
273  }
274 
275  // checked earlier by counting entries on input, just in case
276  Q_ASSERT( maxDistance >= 0 );
277 
278  // accept candidate to centers
279  distances[candidateCenter] = -1;
280  // copy the point coordinates into the initial centers array
281  centers[i] = points[candidateCenter].point;
282  }
283  }
284 }
285 
286 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
287 
288 void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> &centers, int k, QgsProcessingFeedback *feedback )
289 {
290  int converged = false;
291  bool changed = false;
292 
293  // avoid reallocating weights array for every iteration
294  std::vector< uint > weights( k );
295 
296  uint i = 0;
297  for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
298  {
299  if ( feedback && feedback->isCanceled() )
300  break;
301 
302  findNearest( objs, centers, k, changed );
303  updateMeans( objs, centers, weights, k );
304  converged = !changed;
305  }
306 
307  if ( !converged && feedback )
308  feedback->reportError( QObject::tr( "Clustering did not converge after %1 iterations" ).arg( i ) );
309  else if ( feedback )
310  feedback->pushInfo( QObject::tr( "Clustering converged after %1 iterations" ).arg( i ) );
311 }
312 
313 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
314 
315 void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points, const std::vector<QgsPointXY> &centers, const int k, bool &changed )
316 {
317  changed = false;
318  std::size_t n = points.size();
319  for ( std::size_t i = 0; i < n; i++ )
320  {
321  Feature &point = points[i];
322 
323  // Initialize with distance to first cluster
324  double currentDistance = point.point.sqrDist( centers[0] );
325  int currentCluster = 0;
326 
327  // Check all other cluster centers and find the nearest
328  for ( int cluster = 1; cluster < k; cluster++ )
329  {
330  const double distance = point.point.sqrDist( centers[cluster] );
331  if ( distance < currentDistance )
332  {
333  currentDistance = distance;
334  currentCluster = cluster;
335  }
336  }
337 
338  // Store the nearest cluster this object is in
339  if ( point.cluster != currentCluster )
340  {
341  changed = true;
342  point.cluster = currentCluster;
343  }
344  }
345 }
346 
347 // ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
348 
349 void QgsKMeansClusteringAlgorithm::updateMeans( const std::vector<Feature> &points, std::vector<QgsPointXY> &centers, std::vector<uint> &weights, const int k )
350 {
351  uint n = points.size();
352  std::fill( weights.begin(), weights.end(), 0 );
353  for ( int i = 0; i < k; i++ )
354  {
355  centers[i].setX( 0.0 );
356  centers[i].setY( 0.0 );
357  }
358  for ( uint i = 0; i < n; i++ )
359  {
360  int cluster = points[i].cluster;
361  centers[cluster] += QgsVector( points[i].point.x(),
362  points[i].point.y() );
363  weights[cluster] += 1;
364  }
365  for ( int i = 0; i < k; i++ )
366  {
367  centers[i] /= weights[i];
368  }
369 }
370 
371 
373 
374 
QgsFeatureId id
Definition: qgsfeature.h:64
Wrapper for iterator of features from vector data provider or vector layer.
bool isCanceled() const
Tells whether the operation has been canceled already.
Definition: qgsfeedback.h:54
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
Base class for providing feedback from a processing algorithm.
bool isNull() const
Returns true if the geometry is null (ie, contains no underlying geometry accessible via geometry() )...
A class to represent a 2D point.
Definition: qgspointxy.h:43
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference)
Definition: qgis.h:278
void setProgress(double progress)
Sets the current progress for the feedback object.
Definition: qgsfeedback.h:63
double sqrDist(double x, double y) const
Returns the squared distance between this point a specified x, y coordinate.
Definition: qgspointxy.h:169
Container of fields for a vector layer.
Definition: qgsfields.h:42
A geometry is the spatial representation of a feature.
Definition: qgsgeometry.h:106
void setAttributes(const QgsAttributes &attrs)
Sets the feature&#39;s attributes.
Definition: qgsfeature.cpp:127
The feature class encapsulates a single feature including its id, geometry and a list of field/values...
Definition: qgsfeature.h:55
A feature sink output for processing algorithms.
const QgsAbstractGeometry * constGet() const
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB)
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
This class wraps a request for features to a vector layer (or directly its vector data provider)...
Custom exception class for processing related exceptions.
Definition: qgsexception.h:82
bool append(const QgsField &field, FieldOrigin origin=OriginProvider, int originIndex=-1)
Appends a field. The field must have unique name, otherwise it is rejected (returns false) ...
Definition: qgsfields.cpp:59
Encapsulate a field in an attribute table or data source.
Definition: qgsfield.h:48
A numeric parameter for processing algorithms.
A class to represent a vector.
Definition: qgsvector.h:28
An input feature source (such as vector layers) parameter for processing algorithms.
bool hasGeometry() const
Returns true if the feature has an associated geometry.
Definition: qgsfeature.cpp:197
QgsGeometry geometry
Definition: qgsfeature.h:67
bool nextFeature(QgsFeature &f)
A vector of attributes.
Definition: qgsattributes.h:57
static Type flatType(Type type)
Returns the flat type for a WKB type.
Definition: qgswkbtypes.h:565
Contains information about the context in which a processing algorithm is executed.
QgsWkbTypes::Type wkbType() const
Returns type of the geometry as a WKB type (point / linestring / polygon etc.)
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
A string parameter for processing algorithms.
Any vector layer with geometry.
Definition: qgsprocessing.h:47
QgsAttributes attributes
Definition: qgsfeature.h:65
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
QgsGeometry centroid() const
Returns the center of mass of a geometry.