QGIS API Documentation  3.2.0-Bonn (bc43194)
qgsstringutils.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  qgsstringutils.cpp
3  ------------------
4  begin : June 2015
5  copyright : (C) 2015 by Nyall Dawson
6  email : nyall dot dawson at gmail dot com
7  ***************************************************************************
8  * *
9  * This program is free software; you can redistribute it and/or modify *
10  * it under the terms of the GNU General Public License as published by *
11  * the Free Software Foundation; either version 2 of the License, or *
12  * (at your option) any later version. *
13  * *
14  ***************************************************************************/
15 
16 #include "qgsstringutils.h"
17 #include <QVector>
18 #include <QRegExp>
19 #include <QStringList>
20 #include <QTextBoundaryFinder>
21 #include <QRegularExpression>
22 
23 QString QgsStringUtils::capitalize( const QString &string, QgsStringUtils::Capitalization capitalization )
24 {
25  if ( string.isEmpty() )
26  return QString();
27 
28  switch ( capitalization )
29  {
30  case MixedCase:
31  return string;
32 
33  case AllUppercase:
34  return string.toUpper();
35 
36  case AllLowercase:
37  return string.toLower();
38 
40  {
41  QString temp = string;
42 
43  QTextBoundaryFinder wordSplitter( QTextBoundaryFinder::Word, string.constData(), string.length(), nullptr, 0 );
44  QTextBoundaryFinder letterSplitter( QTextBoundaryFinder::Grapheme, string.constData(), string.length(), nullptr, 0 );
45 
46  wordSplitter.setPosition( 0 );
47  bool first = true;
48  while ( ( first && wordSplitter.boundaryReasons() & QTextBoundaryFinder::StartOfItem )
49  || wordSplitter.toNextBoundary() >= 0 )
50  {
51  first = false;
52  letterSplitter.setPosition( wordSplitter.position() );
53  letterSplitter.toNextBoundary();
54  QString substr = string.mid( wordSplitter.position(), letterSplitter.position() - wordSplitter.position() );
55  temp.replace( wordSplitter.position(), substr.length(), substr.toUpper() );
56  }
57  return temp;
58  }
59 
60  case TitleCase:
61  {
62  // yes, this is MASSIVELY simplifying the problem!!
63 
64  static QStringList smallWords;
65  static QStringList newPhraseSeparators;
66  static QRegularExpression splitWords;
67  if ( smallWords.empty() )
68  {
69  smallWords = QObject::tr( "a|an|and|as|at|but|by|en|for|if|in|nor|of|on|or|per|s|the|to|vs.|vs|via" ).split( '|' );
70  newPhraseSeparators = QObject::tr( ".|:" ).split( '|' );
71  splitWords = QRegularExpression( QStringLiteral( "\\b" ), QRegularExpression::UseUnicodePropertiesOption );
72  }
73 
74  const QStringList parts = string.split( splitWords, QString::SkipEmptyParts );
75  QString result;
76  bool firstWord = true;
77  int i = 0;
78  int lastWord = parts.count() - 1;
79  for ( const QString &word : qgis::as_const( parts ) )
80  {
81  if ( newPhraseSeparators.contains( word.trimmed() ) )
82  {
83  firstWord = true;
84  result += word;
85  }
86  else if ( firstWord || ( i == lastWord ) || !smallWords.contains( word ) )
87  {
88  result += word.at( 0 ).toUpper() + word.mid( 1 );
89  firstWord = false;
90  }
91  else
92  {
93  result += word;
94  }
95  i++;
96  }
97  return result;
98  }
99  }
100  // no warnings
101  return string;
102 }
103 
104 // original code from http://www.qtcentre.org/threads/52456-HTML-Unicode-ampersand-encoding
105 QString QgsStringUtils::ampersandEncode( const QString &string )
106 {
107  QString encoded;
108  for ( int i = 0; i < string.size(); ++i )
109  {
110  QChar ch = string.at( i );
111  if ( ch.unicode() > 160 )
112  encoded += QStringLiteral( "&#%1;" ).arg( static_cast< int >( ch.unicode() ) );
113  else if ( ch.unicode() == 38 )
114  encoded += QStringLiteral( "&amp;" );
115  else if ( ch.unicode() == 60 )
116  encoded += QStringLiteral( "&lt;" );
117  else if ( ch.unicode() == 62 )
118  encoded += QStringLiteral( "&gt;" );
119  else
120  encoded += ch;
121  }
122  return encoded;
123 }
124 
125 int QgsStringUtils::levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive )
126 {
127  int length1 = string1.length();
128  int length2 = string2.length();
129 
130  //empty strings? solution is trivial...
131  if ( string1.isEmpty() )
132  {
133  return length2;
134  }
135  else if ( string2.isEmpty() )
136  {
137  return length1;
138  }
139 
140  //handle case sensitive flag (or not)
141  QString s1( caseSensitive ? string1 : string1.toLower() );
142  QString s2( caseSensitive ? string2 : string2.toLower() );
143 
144  const QChar *s1Char = s1.constData();
145  const QChar *s2Char = s2.constData();
146 
147  //strip out any common prefix
148  int commonPrefixLen = 0;
149  while ( length1 > 0 && length2 > 0 && *s1Char == *s2Char )
150  {
151  commonPrefixLen++;
152  length1--;
153  length2--;
154  s1Char++;
155  s2Char++;
156  }
157 
158  //strip out any common suffix
159  while ( length1 > 0 && length2 > 0 && s1.at( commonPrefixLen + length1 - 1 ) == s2.at( commonPrefixLen + length2 - 1 ) )
160  {
161  length1--;
162  length2--;
163  }
164 
165  //fully checked either string? if so, the answer is easy...
166  if ( length1 == 0 )
167  {
168  return length2;
169  }
170  else if ( length2 == 0 )
171  {
172  return length1;
173  }
174 
175  //ensure the inner loop is longer
176  if ( length1 > length2 )
177  {
178  std::swap( s1, s2 );
179  std::swap( length1, length2 );
180  }
181 
182  //levenshtein algorithm begins here
183  QVector< int > col;
184  col.fill( 0, length2 + 1 );
185  QVector< int > prevCol;
186  prevCol.reserve( length2 + 1 );
187  for ( int i = 0; i < length2 + 1; ++i )
188  {
189  prevCol << i;
190  }
191  const QChar *s2start = s2Char;
192  for ( int i = 0; i < length1; ++i )
193  {
194  col[0] = i + 1;
195  s2Char = s2start;
196  for ( int j = 0; j < length2; ++j )
197  {
198  col[j + 1] = std::min( std::min( 1 + col[j], 1 + prevCol[1 + j] ), prevCol[j] + ( ( *s1Char == *s2Char ) ? 0 : 1 ) );
199  s2Char++;
200  }
201  col.swap( prevCol );
202  s1Char++;
203  }
204  return prevCol[length2];
205 }
206 
207 QString QgsStringUtils::longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive )
208 {
209  if ( string1.isEmpty() || string2.isEmpty() )
210  {
211  //empty strings, solution is trivial...
212  return QString();
213  }
214 
215  //handle case sensitive flag (or not)
216  QString s1( caseSensitive ? string1 : string1.toLower() );
217  QString s2( caseSensitive ? string2 : string2.toLower() );
218 
219  if ( s1 == s2 )
220  {
221  //another trivial case, identical strings
222  return s1;
223  }
224 
225  int *currentScores = new int [ s2.length()];
226  int *previousScores = new int [ s2.length()];
227  int maxCommonLength = 0;
228  int lastMaxBeginIndex = 0;
229 
230  const QChar *s1Char = s1.constData();
231  const QChar *s2Char = s2.constData();
232  const QChar *s2Start = s2Char;
233 
234  for ( int i = 0; i < s1.length(); ++i )
235  {
236  for ( int j = 0; j < s2.length(); ++j )
237  {
238  if ( *s1Char != *s2Char )
239  {
240  currentScores[j] = 0;
241  }
242  else
243  {
244  if ( i == 0 || j == 0 )
245  {
246  currentScores[j] = 1;
247  }
248  else
249  {
250  currentScores[j] = 1 + previousScores[j - 1];
251  }
252 
253  if ( maxCommonLength < currentScores[j] )
254  {
255  maxCommonLength = currentScores[j];
256  lastMaxBeginIndex = i;
257  }
258  }
259  s2Char++;
260  }
261  std::swap( currentScores, previousScores );
262  s1Char++;
263  s2Char = s2Start;
264  }
265  delete [] currentScores;
266  delete [] previousScores;
267  return string1.mid( lastMaxBeginIndex - maxCommonLength + 1, maxCommonLength );
268 }
269 
270 int QgsStringUtils::hammingDistance( const QString &string1, const QString &string2, bool caseSensitive )
271 {
272  if ( string1.isEmpty() && string2.isEmpty() )
273  {
274  //empty strings, solution is trivial...
275  return 0;
276  }
277 
278  if ( string1.length() != string2.length() )
279  {
280  //invalid inputs
281  return -1;
282  }
283 
284  //handle case sensitive flag (or not)
285  QString s1( caseSensitive ? string1 : string1.toLower() );
286  QString s2( caseSensitive ? string2 : string2.toLower() );
287 
288  if ( s1 == s2 )
289  {
290  //another trivial case, identical strings
291  return 0;
292  }
293 
294  int distance = 0;
295  const QChar *s1Char = s1.constData();
296  const QChar *s2Char = s2.constData();
297 
298  for ( int i = 0; i < string1.length(); ++i )
299  {
300  if ( *s1Char != *s2Char )
301  distance++;
302  s1Char++;
303  s2Char++;
304  }
305 
306  return distance;
307 }
308 
309 QString QgsStringUtils::soundex( const QString &string )
310 {
311  if ( string.isEmpty() )
312  return QString();
313 
314  QString tmp = string.toUpper();
315 
316  //strip non character codes, and vowel like characters after the first character
317  QChar *char1 = tmp.data();
318  QChar *char2 = tmp.data();
319  int outLen = 0;
320  for ( int i = 0; i < tmp.length(); ++i, ++char2 )
321  {
322  if ( ( *char2 ).unicode() >= 0x41 && ( *char2 ).unicode() <= 0x5A && ( i == 0 || ( ( *char2 ).unicode() != 0x41 && ( *char2 ).unicode() != 0x45
323  && ( *char2 ).unicode() != 0x48 && ( *char2 ).unicode() != 0x49
324  && ( *char2 ).unicode() != 0x4F && ( *char2 ).unicode() != 0x55
325  && ( *char2 ).unicode() != 0x57 && ( *char2 ).unicode() != 0x59 ) ) )
326  {
327  *char1 = *char2;
328  char1++;
329  outLen++;
330  }
331  }
332  tmp.truncate( outLen );
333 
334  QChar *tmpChar = tmp.data();
335  tmpChar++;
336  for ( int i = 1; i < tmp.length(); ++i, ++tmpChar )
337  {
338  switch ( ( *tmpChar ).unicode() )
339  {
340  case 0x42:
341  case 0x46:
342  case 0x50:
343  case 0x56:
344  tmp.replace( i, 1, QChar( 0x31 ) );
345  break;
346 
347  case 0x43:
348  case 0x47:
349  case 0x4A:
350  case 0x4B:
351  case 0x51:
352  case 0x53:
353  case 0x58:
354  case 0x5A:
355  tmp.replace( i, 1, QChar( 0x32 ) );
356  break;
357 
358  case 0x44:
359  case 0x54:
360  tmp.replace( i, 1, QChar( 0x33 ) );
361  break;
362 
363  case 0x4C:
364  tmp.replace( i, 1, QChar( 0x34 ) );
365  break;
366 
367  case 0x4D:
368  case 0x4E:
369  tmp.replace( i, 1, QChar( 0x35 ) );
370  break;
371 
372  case 0x52:
373  tmp.replace( i, 1, QChar( 0x36 ) );
374  break;
375  }
376  }
377 
378  //remove adjacent duplicates
379  char1 = tmp.data();
380  char2 = tmp.data();
381  char2++;
382  outLen = 1;
383  for ( int i = 1; i < tmp.length(); ++i, ++char2 )
384  {
385  if ( *char2 != *char1 )
386  {
387  char1++;
388  *char1 = *char2;
389  outLen++;
390  if ( outLen == 4 )
391  break;
392  }
393  }
394  tmp.truncate( outLen );
395  if ( tmp.length() < 4 )
396  {
397  tmp.append( "000" );
398  tmp.truncate( 4 );
399  }
400 
401  return tmp;
402 }
403 
404 QString QgsStringUtils::insertLinks( const QString &string, bool *foundLinks )
405 {
406  QString converted = string;
407 
408  // http://alanstorm.com/url_regex_explained
409  // note - there's more robust implementations available, but we need one which works within the limitation of QRegExp
410  static QRegExp urlRegEx( "(\\b(([\\w-]+://?|www[.])[^\\s()<>]+(?:\\([\\w\\d]+\\)|([^!\"#$%&'()*+,\\-./:;<=>?@[\\\\\\]^_`{|}~\\s]|/))))" );
411  static QRegExp protoRegEx( "^(?:f|ht)tps?://" );
412  static QRegExp emailRegEx( "([\\w._%+-]+@[\\w.-]+\\.[A-Za-z]+)" );
413 
414  int offset = 0;
415  bool found = false;
416  while ( urlRegEx.indexIn( converted, offset ) != -1 )
417  {
418  found = true;
419  QString url = urlRegEx.cap( 1 );
420  QString protoUrl = url;
421  if ( protoRegEx.indexIn( protoUrl ) == -1 )
422  {
423  protoUrl.prepend( "http://" );
424  }
425  QString anchor = QStringLiteral( "<a href=\"%1\">%2</a>" ).arg( protoUrl.toHtmlEscaped(), url.toHtmlEscaped() );
426  converted.replace( urlRegEx.pos( 1 ), url.length(), anchor );
427  offset = urlRegEx.pos( 1 ) + anchor.length();
428  }
429  offset = 0;
430  while ( emailRegEx.indexIn( converted, offset ) != -1 )
431  {
432  found = true;
433  QString email = emailRegEx.cap( 1 );
434  QString anchor = QStringLiteral( "<a href=\"mailto:%1\">%1</a>" ).arg( email.toHtmlEscaped(), email.toHtmlEscaped() );
435  converted.replace( emailRegEx.pos( 1 ), email.length(), anchor );
436  offset = emailRegEx.pos( 1 ) + anchor.length();
437  }
438 
439  if ( foundLinks )
440  *foundLinks = found;
441 
442  return converted;
443 }
444 
445 QgsStringReplacement::QgsStringReplacement( const QString &match, const QString &replacement, bool caseSensitive, bool wholeWordOnly )
446  : mMatch( match )
447  , mReplacement( replacement )
448  , mCaseSensitive( caseSensitive )
449  , mWholeWordOnly( wholeWordOnly )
450 {
451  if ( mWholeWordOnly )
452  mRx = QRegExp( QString( "\\b%1\\b" ).arg( mMatch ),
453  mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
454 }
455 
456 QString QgsStringReplacement::process( const QString &input ) const
457 {
458  QString result = input;
459  if ( !mWholeWordOnly )
460  {
461  return result.replace( mMatch, mReplacement, mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
462  }
463  else
464  {
465  return result.replace( mRx, mReplacement );
466  }
467 }
468 
470 {
471  QgsStringMap map;
472  map.insert( QStringLiteral( "match" ), mMatch );
473  map.insert( QStringLiteral( "replace" ), mReplacement );
474  map.insert( QStringLiteral( "caseSensitive" ), mCaseSensitive ? "1" : "0" );
475  map.insert( QStringLiteral( "wholeWord" ), mWholeWordOnly ? "1" : "0" );
476  return map;
477 }
478 
480 {
481  return QgsStringReplacement( properties.value( QStringLiteral( "match" ) ),
482  properties.value( QStringLiteral( "replace" ) ),
483  properties.value( QStringLiteral( "caseSensitive" ), QStringLiteral( "0" ) ) == QLatin1String( "1" ),
484  properties.value( QStringLiteral( "wholeWord" ), QStringLiteral( "0" ) ) == QLatin1String( "1" ) );
485 }
486 
487 QString QgsStringReplacementCollection::process( const QString &input ) const
488 {
489  QString result = input;
490  Q_FOREACH ( const QgsStringReplacement &r, mReplacements )
491  {
492  result = r.process( result );
493  }
494  return result;
495 }
496 
497 void QgsStringReplacementCollection::writeXml( QDomElement &elem, QDomDocument &doc ) const
498 {
499  Q_FOREACH ( const QgsStringReplacement &r, mReplacements )
500  {
501  QgsStringMap props = r.properties();
502  QDomElement propEl = doc.createElement( QStringLiteral( "replacement" ) );
503  QgsStringMap::const_iterator it = props.constBegin();
504  for ( ; it != props.constEnd(); ++it )
505  {
506  propEl.setAttribute( it.key(), it.value() );
507  }
508  elem.appendChild( propEl );
509  }
510 }
511 
512 void QgsStringReplacementCollection::readXml( const QDomElement &elem )
513 {
514  mReplacements.clear();
515  QDomNodeList nodelist = elem.elementsByTagName( QStringLiteral( "replacement" ) );
516  for ( int i = 0; i < nodelist.count(); i++ )
517  {
518  QDomElement replacementElem = nodelist.at( i ).toElement();
519  QDomNamedNodeMap nodeMap = replacementElem.attributes();
520 
521  QgsStringMap props;
522  for ( int j = 0; j < nodeMap.count(); ++j )
523  {
524  props.insert( nodeMap.item( j ).nodeName(), nodeMap.item( j ).nodeValue() );
525  }
526  mReplacements << QgsStringReplacement::fromProperties( props );
527  }
528 
529 }
static QString longestCommonSubstring(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the longest common substring between two strings.
A representation of a single string replacement.
Simple title case conversion - does not fully grammatically parse the text and uses simple rules only...
QMap< QString, QString > QgsStringMap
Definition: qgis.h:501
static QString soundex(const QString &string)
Returns the Soundex representation of a string.
void writeXml(QDomElement &elem, QDomDocument &doc) const
Writes the collection state to an XML element.
static QgsStringReplacement fromProperties(const QgsStringMap &properties)
Creates a new QgsStringReplacement from an encoded properties map.
static QString capitalize(const QString &string, Capitalization capitalization)
Converts a string by applying capitalization rules to the string.
static int levenshteinDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Levenshtein edit distance between two strings.
Convert just the first letter of each word to uppercase, leave the rest untouched.
Convert all characters to uppercase.
Capitalization
Capitalization options.
QgsStringMap properties() const
Returns a map of the replacement properties.
static QString ampersandEncode(const QString &string)
Makes a raw string safe for inclusion as a HTML/XML string literal.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made.
Mixed case, i.e. no change.
Convert all characters to lowercase.
void readXml(const QDomElement &elem)
Reads the collection state from an XML element.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made using QgsStringR...
QgsStringReplacement(const QString &match, const QString &replacement, bool caseSensitive=false, bool wholeWordOnly=false)
Constructor for QgsStringReplacement.
static QString insertLinks(const QString &string, bool *foundLinks=nullptr)
Returns a string with any URL (e.g., http(s)/ftp) and mailto: text converted to valid HTML <a ...
static int hammingDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Hamming distance between two strings.