QGIS API Documentation  2.18.21-Las Palmas (9fba24a)
qgsstringutils.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  qgsstringutils.cpp
3  ------------------
4  begin : June 2015
5  copyright : (C) 2015 by Nyall Dawson
6  email : nyall dot dawson at gmail dot com
7  ***************************************************************************
8  * *
9  * This program is free software; you can redistribute it and/or modify *
10  * it under the terms of the GNU General Public License as published by *
11  * the Free Software Foundation; either version 2 of the License, or *
12  * (at your option) any later version. *
13  * *
14  ***************************************************************************/
15 
16 #include "qgsstringutils.h"
17 #include <QVector>
18 #include <QRegExp>
19 #include <QTextDocument> // for Qt::escape
20 #include <QStringList>
21 #include <QTextBoundaryFinder>
22 
24 {
25  if ( string.isEmpty() )
26  return QString();
27 
28  switch ( capitalization )
29  {
30  case MixedCase:
31  return string;
32 
33  case AllUppercase:
34  return string.toUpper();
35 
36  case AllLowercase:
37  return string.toLower();
38 
40  {
41  QString temp = string;
42 
43  QTextBoundaryFinder wordSplitter( QTextBoundaryFinder::Word, string.constData(), string.length(), 0, 0 );
44  QTextBoundaryFinder letterSplitter( QTextBoundaryFinder::Grapheme, string.constData(), string.length(), 0, 0 );
45 
46  wordSplitter.setPosition( 0 );
47  bool first = true;
48 #if QT_VERSION >= 0x050000
49  while (( first && wordSplitter.boundaryReasons() & QTextBoundaryFinder::StartOfItem )
50  || wordSplitter.toNextBoundary() >= 0 )
51 #else
52  while (( first && wordSplitter.boundaryReasons() & QTextBoundaryFinder::StartWord )
53  || wordSplitter.toNextBoundary() >= 0 )
54 #endif
55  {
56  first = false;
57  letterSplitter.setPosition( wordSplitter.position() );
58  letterSplitter.toNextBoundary();
59  QString substr = string.mid( wordSplitter.position(), letterSplitter.position() - wordSplitter.position() );
60  temp.replace( wordSplitter.position(), substr.length(), substr.toUpper() );
61  }
62  return temp;
63  }
64 
65  }
66  // no warnings
67  return string;
68 }
69 
70 int QgsStringUtils::levenshteinDistance( const QString& string1, const QString& string2, bool caseSensitive )
71 {
72  int length1 = string1.length();
73  int length2 = string2.length();
74 
75  //empty strings? solution is trivial...
76  if ( string1.isEmpty() )
77  {
78  return length2;
79  }
80  else if ( string2.isEmpty() )
81  {
82  return length1;
83  }
84 
85  //handle case sensitive flag (or not)
86  QString s1( caseSensitive ? string1 : string1.toLower() );
87  QString s2( caseSensitive ? string2 : string2.toLower() );
88 
89  const QChar* s1Char = s1.constData();
90  const QChar* s2Char = s2.constData();
91 
92  //strip out any common prefix
93  int commonPrefixLen = 0;
94  while ( length1 > 0 && length2 > 0 && *s1Char == *s2Char )
95  {
96  commonPrefixLen++;
97  length1--;
98  length2--;
99  s1Char++;
100  s2Char++;
101  }
102 
103  //strip out any common suffix
104  while ( length1 > 0 && length2 > 0 && s1.at( commonPrefixLen + length1 - 1 ) == s2.at( commonPrefixLen + length2 - 1 ) )
105  {
106  length1--;
107  length2--;
108  }
109 
110  //fully checked either string? if so, the answer is easy...
111  if ( length1 == 0 )
112  {
113  return length2;
114  }
115  else if ( length2 == 0 )
116  {
117  return length1;
118  }
119 
120  //ensure the inner loop is longer
121  if ( length1 > length2 )
122  {
123  qSwap( s1, s2 );
124  qSwap( length1, length2 );
125  }
126 
127  //levenshtein algorithm begins here
128  QVector< int > col;
129  col.fill( 0, length2 + 1 );
130  QVector< int > prevCol;
131  prevCol.reserve( length2 + 1 );
132  for ( int i = 0; i < length2 + 1; ++i )
133  {
134  prevCol << i;
135  }
136  const QChar* s2start = s2Char;
137  for ( int i = 0; i < length1; ++i )
138  {
139  col[0] = i + 1;
140  s2Char = s2start;
141  for ( int j = 0; j < length2; ++j )
142  {
143  col[j + 1] = qMin( qMin( 1 + col[j], 1 + prevCol[1 + j] ), prevCol[j] + (( *s1Char == *s2Char ) ? 0 : 1 ) );
144  s2Char++;
145  }
146  col.swap( prevCol );
147  s1Char++;
148  }
149  return prevCol[length2];
150 }
151 
152 QString QgsStringUtils::longestCommonSubstring( const QString& string1, const QString& string2, bool caseSensitive )
153 {
154  if ( string1.isEmpty() || string2.isEmpty() )
155  {
156  //empty strings, solution is trivial...
157  return QString();
158  }
159 
160  //handle case sensitive flag (or not)
161  QString s1( caseSensitive ? string1 : string1.toLower() );
162  QString s2( caseSensitive ? string2 : string2.toLower() );
163 
164  if ( s1 == s2 )
165  {
166  //another trivial case, identical strings
167  return s1;
168  }
169 
170  int* currentScores = new int [ s2.length()];
171  int* previousScores = new int [ s2.length()];
172  int maxCommonLength = 0;
173  int lastMaxBeginIndex = 0;
174 
175  const QChar* s1Char = s1.constData();
176  const QChar* s2Char = s2.constData();
177  const QChar* s2Start = s2Char;
178 
179  for ( int i = 0; i < s1.length(); ++i )
180  {
181  for ( int j = 0; j < s2.length(); ++j )
182  {
183  if ( *s1Char != *s2Char )
184  {
185  currentScores[j] = 0;
186  }
187  else
188  {
189  if ( i == 0 || j == 0 )
190  {
191  currentScores[j] = 1;
192  }
193  else
194  {
195  currentScores[j] = 1 + previousScores[j - 1];
196  }
197 
198  if ( maxCommonLength < currentScores[j] )
199  {
200  maxCommonLength = currentScores[j];
201  lastMaxBeginIndex = i;
202  }
203  }
204  s2Char++;
205  }
206  qSwap( currentScores, previousScores );
207  s1Char++;
208  s2Char = s2Start;
209  }
210  delete [] currentScores;
211  delete [] previousScores;
212  return string1.mid( lastMaxBeginIndex - maxCommonLength + 1, maxCommonLength );
213 }
214 
215 int QgsStringUtils::hammingDistance( const QString& string1, const QString& string2, bool caseSensitive )
216 {
217  if ( string1.isEmpty() && string2.isEmpty() )
218  {
219  //empty strings, solution is trivial...
220  return 0;
221  }
222 
223  if ( string1.length() != string2.length() )
224  {
225  //invalid inputs
226  return -1;
227  }
228 
229  //handle case sensitive flag (or not)
230  QString s1( caseSensitive ? string1 : string1.toLower() );
231  QString s2( caseSensitive ? string2 : string2.toLower() );
232 
233  if ( s1 == s2 )
234  {
235  //another trivial case, identical strings
236  return 0;
237  }
238 
239  int distance = 0;
240  const QChar* s1Char = s1.constData();
241  const QChar* s2Char = s2.constData();
242 
243  for ( int i = 0; i < string1.length(); ++i )
244  {
245  if ( *s1Char != *s2Char )
246  distance++;
247  s1Char++;
248  s2Char++;
249  }
250 
251  return distance;
252 }
253 
255 {
256  if ( string.isEmpty() )
257  return QString();
258 
259  QString tmp = string.toUpper();
260 
261  //strip non character codes, and vowel like characters after the first character
262  QChar* char1 = tmp.data();
263  QChar* char2 = tmp.data();
264  int outLen = 0;
265  for ( int i = 0; i < tmp.length(); ++i, ++char2 )
266  {
267  if (( *char2 ).unicode() >= 0x41 && ( *char2 ).unicode() <= 0x5A && ( i == 0 || (( *char2 ).unicode() != 0x41 && ( *char2 ).unicode() != 0x45
268  && ( *char2 ).unicode() != 0x48 && ( *char2 ).unicode() != 0x49
269  && ( *char2 ).unicode() != 0x4F && ( *char2 ).unicode() != 0x55
270  && ( *char2 ).unicode() != 0x57 && ( *char2 ).unicode() != 0x59 ) ) )
271  {
272  *char1 = *char2;
273  char1++;
274  outLen++;
275  }
276  }
277  tmp.truncate( outLen );
278 
279  QChar* tmpChar = tmp.data();
280  tmpChar++;
281  for ( int i = 1; i < tmp.length(); ++i, ++tmpChar )
282  {
283  switch (( *tmpChar ).unicode() )
284  {
285  case 0x42:
286  case 0x46:
287  case 0x50:
288  case 0x56:
289  tmp.replace( i, 1, QChar( 0x31 ) );
290  break;
291 
292  case 0x43:
293  case 0x47:
294  case 0x4A:
295  case 0x4B:
296  case 0x51:
297  case 0x53:
298  case 0x58:
299  case 0x5A:
300  tmp.replace( i, 1, QChar( 0x32 ) );
301  break;
302 
303  case 0x44:
304  case 0x54:
305  tmp.replace( i, 1, QChar( 0x33 ) );
306  break;
307 
308  case 0x4C:
309  tmp.replace( i, 1, QChar( 0x34 ) );
310  break;
311 
312  case 0x4D:
313  case 0x4E:
314  tmp.replace( i, 1, QChar( 0x35 ) );
315  break;
316 
317  case 0x52:
318  tmp.replace( i, 1, QChar( 0x36 ) );
319  break;
320  }
321  }
322 
323  //remove adjacent duplicates
324  char1 = tmp.data();
325  char2 = tmp.data();
326  char2++;
327  outLen = 1;
328  for ( int i = 1; i < tmp.length(); ++i, ++char2 )
329  {
330  if ( *char2 != *char1 )
331  {
332  char1++;
333  *char1 = *char2;
334  outLen++;
335  if ( outLen == 4 )
336  break;
337  }
338  }
339  tmp.truncate( outLen );
340  if ( tmp.length() < 4 )
341  {
342  tmp.append( "000" );
343  tmp.truncate( 4 );
344  }
345 
346  return tmp;
347 }
348 
349 QString QgsStringUtils::insertLinks( const QString& string, bool *foundLinks )
350 {
351  QString converted = string;
352 
353  // http://alanstorm.com/url_regex_explained
354  // note - there's more robust implementations available, but we need one which works within the limitation of QRegExp
355  static QRegExp urlRegEx( "(\\b(([\\w-]+://?|www[.])[^\\s()<>]+(?:\\([\\w\\d]+\\)|([^!\"#$%&'()*+,\\-./:;<=>[email protected][\\\\\\]^_`{|}~\\s]|/))))" );
356  static QRegExp protoRegEx( "^(?:f|ht)tps?://" );
357  static QRegExp emailRegEx( "([\\w._%+-][email protected][\\w.-]+\\.[A-Za-z]+)" );
358 
359  int offset = 0;
360  bool found = false;
361  while ( urlRegEx.indexIn( converted, offset ) != -1 )
362  {
363  found = true;
364  QString url = urlRegEx.cap( 1 );
365  QString protoUrl = url;
366  if ( protoRegEx.indexIn( protoUrl ) == -1 )
367  {
368  protoUrl.prepend( "http://" );
369  }
370  QString anchor = QString( "<a href=\"%1\">%2</a>" ).arg( Qt::escape( protoUrl ) ).arg( Qt::escape( url ) );
371  converted.replace( urlRegEx.pos( 1 ), url.length(), anchor );
372  offset = urlRegEx.pos( 1 ) + anchor.length();
373  }
374  offset = 0;
375  while ( emailRegEx.indexIn( converted, offset ) != -1 )
376  {
377  found = true;
378  QString email = emailRegEx.cap( 1 );
379  QString anchor = QString( "<a href=\"mailto:%1\">%1</a>" ).arg( Qt::escape( email ) ).arg( Qt::escape( email ) );
380  converted.replace( emailRegEx.pos( 1 ), email.length(), anchor );
381  offset = emailRegEx.pos( 1 ) + anchor.length();
382  }
383 
384  if ( foundLinks )
385  *foundLinks = found;
386 
387  return converted;
388 }
389 
390 QgsStringReplacement::QgsStringReplacement( const QString& match, const QString& replacement, bool caseSensitive, bool wholeWordOnly )
391  : mMatch( match )
392  , mReplacement( replacement )
393  , mCaseSensitive( caseSensitive )
394  , mWholeWordOnly( wholeWordOnly )
395 {
396  if ( mWholeWordOnly )
397  mRx = QRegExp( QString( "\\b%1\\b" ).arg( mMatch ),
398  mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
399 }
400 
402 {
403  QString result = input;
404  if ( !mWholeWordOnly )
405  {
406  return result.replace( mMatch, mReplacement, mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
407  }
408  else
409  {
410  return result.replace( mRx, mReplacement );
411  }
412 }
413 
415 {
416  QgsStringMap map;
417  map.insert( "match", mMatch );
418  map.insert( "replace", mReplacement );
419  map.insert( "caseSensitive", mCaseSensitive ? "1" : "0" );
420  map.insert( "wholeWord", mWholeWordOnly ? "1" : "0" );
421  return map;
422 }
423 
425 {
426  return QgsStringReplacement( properties.value( "match" ),
427  properties.value( "replace" ),
428  properties.value( "caseSensitive", "0" ) == "1",
429  properties.value( "wholeWord", "0" ) == "1" );
430 }
431 
433 {
434  QString result = input;
435  Q_FOREACH ( const QgsStringReplacement& r, mReplacements )
436  {
437  result = r.process( result );
438  }
439  return result;
440 }
441 
443 {
444  Q_FOREACH ( const QgsStringReplacement& r, mReplacements )
445  {
446  QgsStringMap props = r.properties();
447  QDomElement propEl = doc.createElement( "replacement" );
449  for ( ; it != props.constEnd(); ++it )
450  {
451  propEl.setAttribute( it.key(), it.value() );
452  }
453  elem.appendChild( propEl );
454  }
455 }
456 
458 {
459  mReplacements.clear();
460  QDomNodeList nodelist = elem.elementsByTagName( "replacement" );
461  for ( int i = 0;i < nodelist.count(); i++ )
462  {
463  QDomElement replacementElem = nodelist.at( i ).toElement();
464  QDomNamedNodeMap nodeMap = replacementElem.attributes();
465 
466  QgsStringMap props;
467  for ( int j = 0; j < nodeMap.count(); ++j )
468  {
469  props.insert( nodeMap.item( j ).nodeName(), nodeMap.item( j ).nodeValue() );
470  }
471  mReplacements << QgsStringReplacement::fromProperties( props );
472  }
473 
474 }
int pos(int nth) const
static QString longestCommonSubstring(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the longest common substring between two strings.
QDomNodeList elementsByTagName(const QString &tagname) const
QString cap(int nth) const
QString & append(QChar ch)
QString toUpper() const
void truncate(int position)
QDomNode appendChild(const QDomNode &newChild)
A representation of a single string replacement.
QString nodeValue() const
QString & prepend(QChar ch)
QVector< T > & fill(const T &value, int size)
const_iterator constBegin() const
static QString soundex(const QString &string)
Returns the Soundex representation of a string.
void writeXml(QDomElement &elem, QDomDocument &doc) const
Writes the collection state to an XML element.
QDomElement toElement() const
int indexIn(const QString &str, int offset, CaretMode caretMode) const
static QgsStringReplacement fromProperties(const QgsStringMap &properties)
Creates a new QgsStringReplacement from an encoded properties map.
int count() const
static QString capitalize(const QString &string, Capitalization capitalization)
Converts a string by applying capitalization rules to the string.
const Key & key() const
void setAttribute(const QString &name, const QString &value)
QString nodeName() const
bool isEmpty() const
const_iterator constEnd() const
int count() const
void setPosition(int position)
const T & value() const
static int levenshteinDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Levenshtein edit distance between two strings.
Convert just the first letter of each word to uppercase, leave the rest untouched.
Convert all characters to uppercase.
QString toLower() const
Capitalization
Capitalization options.
void reserve(int size)
QgsStringMap properties() const
Returns a map of the replacement properties.
QString & replace(int position, int n, QChar after)
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made.
QString mid(int position, int n) const
Mixed case, ie no change.
QString escape(const QString &plain)
int length() const
int position() const
void swap(QVector< T > &other)
iterator insert(const Key &key, const T &value)
Convert all characters to lowercase.
void readXml(const QDomElement &elem)
Reads the collection state from an XML element.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made using QgsStringReplacement objects contained by this collection.
BoundaryReasons boundaryReasons() const
QDomElement createElement(const QString &tagName)
QDomNode item(int index) const
QgsStringReplacement(const QString &match, const QString &replacement, bool caseSensitive=false, bool wholeWordOnly=false)
Constructor for QgsStringReplacement.
QChar * data()
QString arg(qlonglong a, int fieldWidth, int base, const QChar &fillChar) const
static QString insertLinks(const QString &string, bool *foundLinks=nullptr)
Returns a string with any URL (eg http(s)/ftp) and mailto: text converted to valid HTML <a ...> links.
static int hammingDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Hamming distance between two strings.
QDomNode at(int index) const
const T value(const Key &key) const
QDomNamedNodeMap attributes() const