Upsilon/ion/src/shared/unicode/utf8_helper.cpp

#include <ion/unicode/utf8_helper.h>
#include <ion/unicode/utf8_decoder.h>
#include <kandinsky/font.h>
#include <string.h>
#include <assert.h>
#include <algorithm>

namespace UTF8Helper {

int CountOccurrences(const char * s, CodePoint c) {
  assert(c != UCodePointNull);
  int count = 0;
  if (UTF8Decoder::CharSizeOfCodePoint(c) == 1) {
    /* The code point is one char long, so it is equal to its char translation.
     * We can do a classic char search. */
    const char * i = s;
    while (*i != 0) {
      if (*i == c) {
        count++;
      }
      i++;
    }
  } else {
    // The code point is more than one char long, we use a UTF8Decoder.
    UTF8Decoder decoder(s);
    CodePoint codePoint = decoder.nextCodePoint();
    while (codePoint != UCodePointNull) {
      if (codePoint == c) {
        count++;
      }
      codePoint = decoder.nextCodePoint();
    }
  }
  return count;
}

/* Scans s for the code point c. Returns the position where the scan ended:
 * the first occurrence of c, the terminating null char, or the place where
 * stoppingPosition was reached (stoppingPosition is ignored when null). */
const char * CodePointSearch(const char * s, CodePoint c, const char * stoppingPosition) {
  const bool hasLimit = stoppingPosition != nullptr;
  if (UTF8Decoder::CharSizeOfCodePoint(c) == 1) {
    // One-char code point: compare chars directly.
    const char * cursor = s;
    for (;;) {
      if (*cursor == 0 || *cursor == c) {
        break;
      }
      if (hasLimit && cursor == stoppingPosition) {
        break;
      }
      cursor++;
    }
    return cursor;
  }
  // Multi-char code point: decode the string code point by code point.
  UTF8Decoder decoder(s);
  const char * current = s;
  CodePoint codePoint = decoder.nextCodePoint();
  for (;;) {
    if (codePoint == UCodePointNull || codePoint == c) {
      break;
    }
    if (hasLimit && current >= stoppingPosition) {
      break;
    }
    // The decoder now sits right after codePoint, i.e. at the next candidate.
    current = decoder.stringPosition();
    codePoint = decoder.nextCodePoint();
  }
  return current;
}

/* Tells whether the non-null code point c is found in s before
 * stoppingPosition (or before the end of the string when stoppingPosition is
 * null). */
bool HasCodePoint(const char * s, CodePoint c, const char * stoppingPosition) {
  assert(c != 0);
  const char * found = CodePointSearch(s, c, stoppingPosition);
  if (*found == 0) {
    // The search ran into the end of the string.
    return false;
  }
  return stoppingPosition == nullptr || found < stoppingPosition;
}

const char * NotCodePointSearch(const char * s, CodePoint c, bool goingLeft, const char * initialPosition) {
  if (goingLeft && initialPosition == s) {
    return s;
  }
  assert(goingLeft || initialPosition == nullptr);
  assert(!goingLeft || initialPosition != nullptr);
  if (UTF8Decoder::CharSizeOfCodePoint(c) == 1) {
    /* The code points are one char long, so they are equal to their char
     * translations. We can do a classic char search. */
    const char * codePointPointer = goingLeft ? initialPosition - 1 : s;
    while ((goingLeft ?codePointPointer > s : *codePointPointer != 0) && *codePointPointer == c) {
      codePointPointer += goingLeft ? -1 : 1;
    }
    return codePointPointer;
  }
  if (goingLeft) {
    UTF8Decoder decoder(s, initialPosition);
    CodePoint codePoint = decoder.previousCodePoint();
    const char * codePointPointer = decoder.stringPosition();
    while (codePointPointer > s && codePoint == c) {
      codePoint = decoder.previousCodePoint();
      codePointPointer = decoder.stringPosition();
    }
    return codePointPointer;
  }
  UTF8Decoder decoder(s);
  const char * codePointPointer = decoder.stringPosition();
  CodePoint codePoint = decoder.nextCodePoint();
  while (codePoint != UCodePointNull && codePoint == c) {
    codePointPointer = decoder.stringPosition();
    codePoint = decoder.nextCodePoint();
  }
  return codePointPointer;
}

/* Copies src into dst (at most dstSize chars, null terminator included) while
 * dropping every code point listed in codePoints. dst is always null-terminated
 * when dstSize is not zero. Returns true if the whole of src was processed,
 * false if the copy stopped early for lack of room. */
bool CopyAndRemoveCodePoints(char * dst, size_t dstSize, const char * src, CodePoint * codePoints, int numberOfCodePoints) {
  UTF8Decoder decoder(src);
  CodePoint codePoint = decoder.nextCodePoint();
  if (dstSize == 0) {
    // Nothing can be written: this only succeeds for an empty source.
    return codePoint == UCodePointNull;
  }
  assert(numberOfCodePoints >= 1);
  const char * chunkStart = src;
  const char * chunkEnd = decoder.stringPosition();
  size_t written = 0;

  while (codePoint != UCodePointNull && written < dstSize) {
    // Look the current code point up in the removal list.
    int listIndex = 0;
    while (listIndex < numberOfCodePoints && codePoint != codePoints[listIndex]) {
      listIndex++;
    }
    const bool mustSkip = listIndex < numberOfCodePoints;
    if (!mustSkip) {
      const size_t chunkSize = chunkEnd - chunkStart;
      if (chunkSize > dstSize - 1 - written) {
        // The code point does not fit (one char is kept for the terminator).
        break;
      }
      memcpy(dst + written, chunkStart, chunkSize);
      written += chunkSize;
    }
    chunkStart = chunkEnd;
    codePoint = decoder.nextCodePoint();
    chunkEnd = decoder.stringPosition();
  }
  dst[written] = 0;
  return codePoint == UCodePointNull;
}

void RemoveCodePoint(char * buffer, CodePoint c, const char * * pointerToUpdate, const char * stoppingPosition) {
  constexpr int patternMaxSize = CodePoint::MaxCodePointCharLength + 1; // +1 for null terminating char
  char pattern[patternMaxSize];
  int codePointCharSize = UTF8Decoder::CharSizeOfCodePoint(c);
  UTF8Decoder::CodePointToChars(c, pattern, codePointCharSize);
  pattern[codePointCharSize] = '\0';
  TextPair pair(pattern, "");
  TryAndReplacePatternsInStringByPatterns(buffer, strlen(buffer), &pair, 1, true, pointerToUpdate, stoppingPosition);
}

/* Slides the null-terminated string text by slidingSize chars within its
 * buffer.
 * - slidingSize > 0: the string (terminator included) is moved to the right,
 *   leaving the first slidingSize chars untouched for the caller to fill.
 * - slidingSize < 0: the first -slidingSize chars are dropped and the rest of
 *   the string is moved to the start of the buffer.
 * - slidingSize == 0: nothing is moved.
 * Returns false, without modifying text, if the resulting length would be
 * negative or greater than textMaxLength. Returns true otherwise. */
bool SlideStringByNumberOfChar(char * text, int slidingSize, size_t textMaxLength) {
  const size_t lenText = strlen(text);
  if (slidingSize < 0) {
    // Widen before negating so that the most negative int cannot overflow.
    const size_t removedLength = static_cast<size_t>(-static_cast<long long>(slidingSize));
    if (removedLength > lenText || lenText - removedLength > textMaxLength) {
      return false;
    }
    /* Only the kept chars and the terminator are moved, so nothing beyond
     * the end of the string is read. */
    memmove(text, text + removedLength, lenText - removedLength + 1);
    return true;
  }
  const size_t addedLength = static_cast<size_t>(slidingSize);
  // Written so that the comparison cannot wrap around.
  if (addedLength > textMaxLength || lenText > textMaxLength - addedLength) {
    return false;
  }
  if (addedLength > 0) {
    memmove(text + addedLength, text, lenText + 1);
  }
  // In case slidingSize = 0, there is nothing to do
  return true;
}

/* Overwrites the first lengthOfPatternToRemove chars of text with
 * replacementPattern. When the two lengths differ, the remainder of the string
 * is first slid so that it ends up right after the replacement chars.
 * Returns true on success. Returns false, leaving text untouched, if text is
 * shorter than lengthOfPatternToRemove or if the slide is refused. */
static bool replaceFirstCharsByPattern(char * text, size_t lengthOfPatternToRemove, const char * replacementPattern, size_t textMaxLength) {
  const size_t replacementLength = strlen(replacementPattern);
  if (lengthOfPatternToRemove > strlen(text)) {
    return false;
  }
  if (!SlideStringByNumberOfChar(text, replacementLength - lengthOfPatternToRemove, textMaxLength)) {
    return false;
  }
  // The room is ready: write the replacement chars (without a terminator).
  const char * source = replacementPattern;
  char * destination = text;
  for (size_t remaining = replacementLength; remaining > 0; remaining--) {
    *destination++ = *source++;
  }
  return true;
}

/* Scans text from left to right and, at each char offset, tries every pair of
 * textPairs in order. When the "matched" side of a pair is found at the
 * current offset, it is replaced in place by the other side.
 * - text: null-terminated string, modified in place.
 * - textMaxLength: maximal length the string may reach; a replacement that
 *   would make the whole string longer is skipped.
 * - firstToSecond: if true, first strings are matched and replaced by second
 *   strings; if false, the other way around.
 * - pointerToUpdate: if not null, the pointed position is shifted by the size
 *   difference of each replacement performed before it.
 * - stoppingPosition: if not null, the scan stops once it reaches this
 *   position (which is shifted along with the replacements). */
void TryAndReplacePatternsInStringByPatterns(char * text, int textMaxLength, TextPair * textPairs, int numberOfPairs, bool firstToSecond, const char * * pointerToUpdate, const char * stoppingPosition) {
  size_t i = 0;
  size_t iPrev = 0;
  size_t textLength = strlen(text);
  const size_t maxLength = static_cast<size_t>(textMaxLength);
  size_t lengthOfParenthesisExtention = strlen("(\x11)");
  while(i < textLength) {
    iPrev = i;
    bool didReplace = false;
    for (int j = 0; j < numberOfPairs; j++) {
      TextPair p = textPairs[j];
      size_t firstStringLength = strlen(p.firstString());
      size_t secondStringLength = strlen(p.secondString());
      /* Instead of storing TextPair("√(\x11)", "sqrt(\x11)") for the keyboard
       * events and TextPair("√", "sqrt") for the copy paste, we store just the
       * first and register it as "function". Therefore we can decide to remove
       * the (\x11) part or not depending on the application. This process is
       * repeated for all 4 function keys usable in python (√, ℯ, ln, log)*/
      if (p.removeParenthesesExtention()) {
        firstStringLength -= lengthOfParenthesisExtention;
        secondStringLength -= lengthOfParenthesisExtention;
      }
      char firstString[TextPair::k_maxLength];
      char secondString[TextPair::k_maxLength];
      // Getting rid of the eventual (\x11) part
      strlcpy((char *)firstString, p.firstString(), firstStringLength+1);
      strlcpy((char *)secondString, p.secondString(), secondStringLength+1);

      char * matchedString = firstToSecond ? firstString : secondString;
      size_t matchedStringLength = strlen(matchedString);
      char * replacingString = firstToSecond ? secondString : firstString;
      size_t replacingStringLength = strlen(replacingString);

      if (strncmp(&text[i], matchedString, matchedStringLength) == 0) {
        /* The replacement only sees the suffix starting at text[i], so it must
         * be given the room left for that suffix. Passing the full maximal
         * length would let the whole string grow past textMaxLength. */
        const size_t remainingMaxLength = maxLength > i ? maxLength - i : 0;
        didReplace = replaceFirstCharsByPattern(&text[i], matchedStringLength, replacingString, remainingMaxLength);
        if (didReplace) {
          int delta = replacingStringLength - matchedStringLength;
          textLength += delta;
          if (pointerToUpdate != nullptr && &text[i] < *pointerToUpdate) {
            // We still have to update the pointer as the modification cursor has not yet exceeded it.
            *pointerToUpdate = *pointerToUpdate + delta;
          }
          if (stoppingPosition != nullptr) {
            stoppingPosition = stoppingPosition + delta;
          }
          if (replacingStringLength != 0) {
            i += replacingStringLength - 1;
            /* When working with multiple TextPairs at the same time, it can be
             * useful to go back by one char. That is the case for empty matrices.
             * Indeed, in the string ",,]",  ",," is replaced by ",\x11,".
             * The ",]" pattern right after would be missed if not for the -1.*/
          }
        }
      }
    }
    if (iPrev == i && !didReplace) {
      // In case no pattern matched with the text, we go to the next char.
      i++;
    }
    if ((stoppingPosition != nullptr) && (&text[i] >= stoppingPosition)) {
      break;
    }
  }
}

size_t CopyUntilCodePoint(char * dst, size_t dstSize, const char * src, CodePoint c) {
  UTF8Decoder decoder(src);
  const char * codePointPointer = decoder.stringPosition();
  CodePoint codePoint = decoder.nextCodePoint();
  while (codePoint != UCodePointNull && codePoint != c) {
    codePointPointer = decoder.stringPosition();
    codePoint = decoder.nextCodePoint();
  }
  assert(codePointPointer >= src);
  size_t copySize = std::min(dstSize - 1, static_cast<size_t>(codePointPointer - src));
  assert(UTF8Helper::CodePointIs(src + copySize, 0) || UTF8Helper::CodePointIs(src + copySize, c));
  memmove(dst, src, copySize);
  assert(copySize < dstSize);
  dst[copySize] = 0;
  return copySize;
}

const char * PerformAtCodePoints(const char * s, CodePoint c, CodePointAction actionCodePoint, CodePointAction actionOtherCodePoint, void * contextPointer, int contextInt1, int contextInt2, CodePoint stoppingCodePoint, bool goingRight, const char * initialPosition, const char * stoppingPosition) {
  /* If we are decoding towards the left, we must have a starting position. If
   * we are decoding towards the right, the starting position is the start of
   * string. */
  assert((goingRight && initialPosition == nullptr)
      || (!goingRight && initialPosition != nullptr));

  if (UTF8Decoder::CharSizeOfCodePoint(c) == 1 && UTF8Decoder::CharSizeOfCodePoint(stoppingCodePoint) == 1) {
    /* The code points are one char long, so they are equal to their char
     * translations. We can do a classic char search. */
    if (goingRight) {
      const char * i = s;
      while (*i != stoppingCodePoint && *i != 0 && i != stoppingPosition) {
        if (*i == c) {
          actionCodePoint(i - s, contextPointer, contextInt1, contextInt2);
        } else {
          // FIXME we are stopping at every char, not every code point -> it does not make any bug for now
          actionOtherCodePoint(i - s, contextPointer, contextInt1, contextInt2);
        }
        i++;
      }
      return i;
    }
    const char * i = initialPosition - 1;
    while (i >= s && *i != stoppingCodePoint && i != stoppingPosition) {
      if (*i == c) {
        actionCodePoint(i - s, contextPointer, contextInt1, contextInt2);
      } else {
        actionOtherCodePoint(i - s, contextPointer, contextInt1, contextInt2);
      }
      i--;
    }
    return i;
  }
  // The code point is more than one char long, we use a UTF8Decoder.
  if (goingRight) {
    UTF8Decoder decoder(s);
    const char * codePointPointer = decoder.stringPosition();
    CodePoint codePoint = decoder.nextCodePoint();
    while (codePoint != stoppingCodePoint && codePoint != UCodePointNull && codePointPointer != stoppingPosition) {
      if (codePoint == c) {
        actionCodePoint(codePointPointer - s, contextPointer, contextInt1, contextInt2);
      } else {
        actionOtherCodePoint(codePointPointer - s, contextPointer, contextInt1, contextInt2);
      }
      codePointPointer = decoder.stringPosition();
      codePoint = decoder.nextCodePoint();
    }
    return codePointPointer;
  }
  assert(!goingRight);
  if (initialPosition <= s) {
    return initialPosition;
  }
  UTF8Decoder decoder(s, initialPosition);
  CodePoint codePoint = decoder.previousCodePoint();
  const char * codePointPointer = decoder.stringPosition();
  while (codePointPointer >= s && codePoint != stoppingCodePoint && codePointPointer != stoppingPosition) {
    if (codePoint == c) {
      actionCodePoint(codePointPointer - s, contextPointer, contextInt1, contextInt2);
    } else {
      actionOtherCodePoint(codePointPointer - s, contextPointer, contextInt1, contextInt2);
    }
    if (codePointPointer > s) {
      codePoint = decoder.previousCodePoint();
      codePointPointer = decoder.stringPosition();
    } else {
      /* If the current pointer is s, we cannot continue decoding. Decreasing s
       * will stop the while loop. */
      codePointPointer = s-1;
    }
  }
  return codePointPointer;
}

/* Returns the code point that ends right before location in buffer, or the
 * null code point if location is the start of buffer. */
CodePoint PreviousCodePoint(const char * buffer, const char * location) {
  if (location != buffer) {
    UTF8Decoder decoder(buffer, location);
    return decoder.previousCodePoint();
  }
  return UCodePointNull;
}

// Decodes and returns the code point that starts at location.
CodePoint CodePointAtLocation(const char * location) {
  return UTF8Decoder(location).nextCodePoint();
}

/* Tells whether the code point that ends right before location is c.
 * location must be strictly after the start of buffer. */
bool PreviousCodePointIs(const char * buffer, const char * location, CodePoint c) {
  assert(location > buffer);
  // A one-char code point can be compared with the previous char directly.
  const bool isSingleChar = UTF8Decoder::CharSizeOfCodePoint(c) == 1;
  return isSingleChar ? location[-1] == c : PreviousCodePoint(buffer, location) == c;
}

// Tells whether the code point that starts at location is c.
bool CodePointIs(const char * location, CodePoint c) {
  // A one-char code point can be compared with the char at location directly.
  const bool isSingleChar = UTF8Decoder::CharSizeOfCodePoint(c) == 1;
  return isSingleChar ? location[0] == c : CodePointAtLocation(location) == c;
}

// A word ends on a space, a line feed or the end of the string.
bool CodePointIsEndOfWord(CodePoint c) {
  if (c == UCodePointNull) {
    return true;
  }
  return c == ' ' || c == '\n';
}

/* Removes from text the glyph that ends right before location, by shifting the
 * rest of the string (terminator included) to the left.
 * If c is not null, it receives the code point decoded at the start of the
 * removed glyph. Returns the number of chars removed, 0 if location is the
 * start of text. */
int RemovePreviousGlyph(const char * text, char * location, CodePoint * c) {
  if (location <= text) {
    assert(location == text);
    return 0;
  }

  // Find where the previous glyph starts
  UTF8Decoder decoder(text, location);
  const char * glyphStart = decoder.previousGlyphPosition();
  if (c != nullptr) {
    *c = decoder.nextCodePoint();
  }

  // Move everything from location up to the null terminating char onto the glyph
  const int removedSize = location - glyphStart;
  char * destination = location - removedSize;
  assert(destination >= text);
  memmove(destination, location, strlen(location) + 1);

  return removedSize;
}

/* Returns the position in buffer of the glyph number position (glyphs are
 * counted from 0, combining code points do not count as glyphs).
 * Returns buffer if position is negative, and the end of the string if it
 * holds fewer glyphs than requested. */
const char * CodePointAtGlyphOffset(const char * buffer, int position) {
  assert(buffer != nullptr);
  if (position < 0) {
    return buffer;
  }

  UTF8Decoder decoder(buffer);
  const char * current = decoder.stringPosition();
  int glyphsSeen = 0;
  for (CodePoint codePoint = decoder.nextCodePoint(); codePoint != UCodePointNull; codePoint = decoder.nextCodePoint()) {
    if (glyphsSeen == position) {
      assert(!codePoint.isCombining());
      return current;
    }
    if (!codePoint.isCombining()) {
      glyphsSeen++;
    }
    // Remember where the next code point starts before decoding it.
    current = decoder.stringPosition();
  }
  return current;
}

/* Returns the number of glyphs found in buffer before position (combining code
 * points do not count as glyphs). If position is never met while decoding,
 * the glyph count of the whole string is returned. */
size_t GlyphOffsetAtCodePoint(const char * buffer, const char * position) {
  assert(position >= buffer);

  UTF8Decoder decoder(buffer);
  const char * current = decoder.stringPosition();
  size_t glyphsSeen = 0;
  for (CodePoint codePoint = decoder.nextCodePoint(); codePoint != UCodePointNull; codePoint = decoder.nextCodePoint()) {
    if (current == position) {
      assert(!codePoint.isCombining());
      return glyphsSeen;
    }
    if (!codePoint.isCombining()) {
      glyphsSeen++;
    }
    // Remember where the next code point starts before decoding it.
    current = decoder.stringPosition();
  }
  return glyphsSeen;
}

/* Counts the glyphs of s (combining code points do not count as glyphs).
 * If maxSize is positive, decoding stops once maxSize chars have been read.
 * If maxSize is negative, the whole string is read. */
size_t StringGlyphLength(const char * s, int maxSize) {
  if (maxSize == 0) {
    return 0;
  }
  const bool unbounded = maxSize < 0;
  UTF8Decoder decoder(s);
  size_t glyphCount = 0;
  for (;;) {
    if (!unbounded && (decoder.stringPosition() - s) >= maxSize) {
      break;
    }
    CodePoint codePoint = decoder.nextCodePoint();
    if (codePoint == UCodePointNull) {
      break;
    }
    if (!codePoint.isCombining()) {
      glyphCount++;
    }
  }
  return glyphCount;
}

/* Decodes text backwards from word and returns the position of the start of
 * the word that precedes it: the scan stops on an end-of-word code point or on
 * the start of text. */
const char * BeginningOfWord(const char * text, const char * word) {
  if (text == word) {
    return text;
  }
  UTF8Decoder decoder(text, word);
  const char * wordStart = decoder.stringPosition();
  CodePoint codePoint = decoder.previousCodePoint();
  for (;;) {
    if (CodePointIsEndOfWord(codePoint)) {
      break;
    }
    // codePoint belongs to the word, which thus starts at least here.
    wordStart = decoder.stringPosition();
    if (wordStart == text) {
      break;
    }
    codePoint = decoder.previousCodePoint();
  }
  return wordStart;
}

/* Returns the position of the first end-of-word code point found at or after
 * word (see CodePointIsEndOfWord). */
const char * EndOfWord(const char * word) {
  UTF8Decoder decoder(word);
  const char * wordEnd = word;
  for (CodePoint codePoint = decoder.nextCodePoint(); !CodePointIsEndOfWord(codePoint); codePoint = decoder.nextCodePoint()) {
    // codePoint belongs to the word, which thus ends after it.
    wordEnd = decoder.stringPosition();
  }
  return wordEnd;
}

void countGlyphsInLine(const char * text, int * before, int * after, const char * beforeLocation, const char *afterLocation) {
  UTF8Helper::CodePointAction countGlyph = [](int, void * glyphCount, int, int) {
    int * castedCount = (int *) glyphCount;
    *castedCount = *castedCount + 1;
  };
  // Count glyphs before
  UTF8Helper::PerformAtCodePoints(text, UCodePointLineFeed, nullptr, countGlyph, before, 0, 0, UCodePointLineFeed, false, beforeLocation);
  if (afterLocation == nullptr) {
    afterLocation = beforeLocation;
  }
  // Count glyphs after
  UTF8Helper::PerformAtCodePoints(afterLocation, UCodePointLineFeed, nullptr, countGlyph, after, 0, 0, UCodePointLineFeed);
}

}