llvm-project/clang/lib/Format/BreakableToken.cpp

//===--- BreakableToken.cpp - Format C++ code -----------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Contains implementation of BreakableToken class and classes derived
/// from it.
///
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "format-token-breaker"

#include "BreakableToken.h"
#include "clang/Format/Format.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
#include <algorithm>

namespace clang {
namespace format {
namespace {

// FIXME: Move helper string functions to where it makes sense.

unsigned getOctalLength(StringRef Text) {
  unsigned I = 1;
  while (I < Text.size() && I < 4 && (Text[I] >= '0' && Text[I] <= '7')) {
    ++I;
  }
  return I;
}

unsigned getHexLength(StringRef Text) {
  unsigned I = 2; // Point after '\x'.
  while (I < Text.size() && ((Text[I] >= '0' && Text[I] <= '9') ||
                             (Text[I] >= 'a' && Text[I] <= 'f') ||
                             (Text[I] >= 'A' && Text[I] <= 'F'))) {
    ++I;
  }
  return I;
}

unsigned getEscapeSequenceLength(StringRef Text) {
  assert(Text[0] == '\\');
  if (Text.size() < 2)
    return 1;

  switch (Text[1]) {
  case 'u':
    return 6;
  case 'U':
    return 10;
  case 'x':
    return getHexLength(Text);
  default:
    if (Text[1] >= '0' && Text[1] <= '7')
      return getOctalLength(Text);
    return 2;
  }
}

StringRef::size_type getStartOfCharacter(StringRef Text,
                                         StringRef::size_type Offset) {
  StringRef::size_type NextEscape = Text.find('\\');
  while (NextEscape != StringRef::npos && NextEscape < Offset) {
    StringRef::size_type SequenceLength =
        getEscapeSequenceLength(Text.substr(NextEscape));
    if (Offset < NextEscape + SequenceLength)
      return NextEscape;
    NextEscape = Text.find('\\', NextEscape + SequenceLength);
  }
  return Offset;
}

BreakableToken::Split getCommentSplit(StringRef Text,
                                      unsigned ContentStartColumn,
                                      unsigned ColumnLimit) {
  if (ColumnLimit <= ContentStartColumn + 1)
    return BreakableToken::Split(StringRef::npos, 0);

  unsigned MaxSplit = ColumnLimit - ContentStartColumn + 1;
  StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplit);
  if (SpaceOffset == StringRef::npos ||
      // Don't break at leading whitespace.
      Text.find_last_not_of(' ', SpaceOffset) == StringRef::npos) {
    // Make sure that we don't break at leading whitespace that
    // reaches past MaxSplit.
    StringRef::size_type FirstNonWhitespace = Text.find_first_not_of(" ");
    if (FirstNonWhitespace == StringRef::npos)
      // If the comment is only whitespace, we cannot split.
      return BreakableToken::Split(StringRef::npos, 0);
    SpaceOffset =
        Text.find(' ', std::max<unsigned>(MaxSplit, FirstNonWhitespace));
  }
  if (SpaceOffset != StringRef::npos && SpaceOffset != 0) {
    StringRef BeforeCut = Text.substr(0, SpaceOffset).rtrim();
    StringRef AfterCut = Text.substr(SpaceOffset).ltrim();
    return BreakableToken::Split(BeforeCut.size(),
                                 AfterCut.begin() - BeforeCut.end());
  }
  return BreakableToken::Split(StringRef::npos, 0);
}

BreakableToken::Split getStringSplit(StringRef Text,
                                     unsigned ContentStartColumn,
                                     unsigned ColumnLimit) {

  if (ColumnLimit <= ContentStartColumn)
    return BreakableToken::Split(StringRef::npos, 0);
  unsigned MaxSplit = ColumnLimit - ContentStartColumn;
  // FIXME: Reduce unit test case.
  if (Text.empty())
    return BreakableToken::Split(StringRef::npos, 0);
  MaxSplit = std::min<unsigned>(MaxSplit, Text.size() - 1);
  StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplit);
  if (SpaceOffset != StringRef::npos && SpaceOffset != 0)
    return BreakableToken::Split(SpaceOffset + 1, 0);
  StringRef::size_type SlashOffset = Text.rfind('/', MaxSplit);
  if (SlashOffset != StringRef::npos && SlashOffset != 0)
    return BreakableToken::Split(SlashOffset + 1, 0);
  StringRef::size_type SplitPoint = getStartOfCharacter(Text, MaxSplit);
  if (SplitPoint == StringRef::npos || SplitPoint == 0)
    return BreakableToken::Split(StringRef::npos, 0);
  return BreakableToken::Split(SplitPoint, 0);
}

} // namespace

unsigned BreakableSingleLineToken::getLineCount() const { return 1; }

unsigned
BreakableSingleLineToken::getLineLengthAfterSplit(unsigned LineIndex,
                                                  unsigned TailOffset) const {
  return StartColumn + Prefix.size() + Postfix.size() + Line.size() -
         TailOffset;
}

void BreakableSingleLineToken::insertBreak(unsigned LineIndex,
                                           unsigned TailOffset, Split Split,
                                           bool InPPDirective,
                                           WhitespaceManager &Whitespaces) {
  Whitespaces.breakToken(Tok, Prefix.size() + TailOffset + Split.first,
                         Split.second, Postfix, Prefix, InPPDirective,
                         StartColumn);
}

BreakableSingleLineToken::BreakableSingleLineToken(const FormatToken &Tok,
                                                   unsigned StartColumn,
                                                   StringRef Prefix,
                                                   StringRef Postfix)
    : BreakableToken(Tok), StartColumn(StartColumn), Prefix(Prefix),
      Postfix(Postfix) {
  assert(Tok.TokenText.startswith(Prefix) && Tok.TokenText.endswith(Postfix));
  Line = Tok.TokenText.substr(
      Prefix.size(), Tok.TokenText.size() - Prefix.size() - Postfix.size());
}

BreakableStringLiteral::BreakableStringLiteral(const FormatToken &Tok,
                                               unsigned StartColumn)
    : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"") {}

BreakableToken::Split
BreakableStringLiteral::getSplit(unsigned LineIndex, unsigned TailOffset,
                                 unsigned ColumnLimit) const {
  return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit);
}

static StringRef getLineCommentPrefix(StringRef Comment) {
  const char *KnownPrefixes[] = { "/// ", "///", "// ", "//" };
  for (size_t i = 0, e = llvm::array_lengthof(KnownPrefixes); i != e; ++i)
    if (Comment.startswith(KnownPrefixes[i]))
      return KnownPrefixes[i];
  return "";
}

BreakableLineComment::BreakableLineComment(const FormatToken &Token,
                                           unsigned StartColumn)
    : BreakableSingleLineToken(Token, StartColumn,
                               getLineCommentPrefix(Token.TokenText), "") {}

BreakableToken::Split
BreakableLineComment::getSplit(unsigned LineIndex, unsigned TailOffset,
                               unsigned ColumnLimit) const {
  return getCommentSplit(Line.substr(TailOffset), StartColumn + Prefix.size(),
                         ColumnLimit);
}

BreakableBlockComment::BreakableBlockComment(const FormatStyle &Style,
                                             const FormatToken &Token,
                                             unsigned StartColumn,
                                             unsigned OriginalStartColumn,
                                             bool FirstInLine)
    : BreakableToken(Token) {
  StringRef TokenText(Token.TokenText);
  assert(TokenText.startswith("/*") && TokenText.endswith("*/"));
  TokenText.substr(2, TokenText.size() - 4).split(Lines, "\n");

  int IndentDelta = StartColumn - OriginalStartColumn;
  bool NeedsStar = true;
  LeadingWhitespace.resize(Lines.size());
  StartOfLineColumn.resize(Lines.size());
  if (Lines.size() == 1 && !FirstInLine) {
    // Comments for which FirstInLine is false can start on arbitrary column,
    // and available horizontal space can be too small to align consecutive
    // lines with the first one.
    // FIXME: We could, probably, align them to current indentation level, but
    // now we just wrap them without stars.
    NeedsStar = false;
  }
  StartOfLineColumn[0] = StartColumn + 2;
  for (size_t i = 1; i < Lines.size(); ++i) {
    adjustWhitespace(Style, i, IndentDelta);
    if (Lines[i].empty())
      // If the last line is empty, the closing "*/" will have a star.
      NeedsStar = NeedsStar && i + 1 == Lines.size();
    else
      NeedsStar = NeedsStar && Lines[i][0] == '*';
  }
  Decoration = NeedsStar ? "* " : "";
  IndentAtLineBreak = StartOfLineColumn[0] + 1;
  for (size_t i = 1; i < Lines.size(); ++i) {
    if (Lines[i].empty()) {
      if (!NeedsStar && i + 1 != Lines.size())
        // For all but the last line (which always ends in */), set the
        // start column to 0 if they're empty, so we do not insert
        // trailing whitespace anywhere.
        StartOfLineColumn[i] = 0;
      continue;
    }
    if (NeedsStar) {
      // The first line already excludes the star.
      // For all other lines, adjust the line to exclude the star and
      // (optionally) the first whitespace.
      int Offset = Lines[i].startswith("* ") ? 2 : 1;
      StartOfLineColumn[i] += Offset;
      Lines[i] = Lines[i].substr(Offset);
      LeadingWhitespace[i] += Offset;
    }
    // Exclude empty lines from the calculation of the left-most column.
    if (Lines[i].empty())
      continue;
    IndentAtLineBreak = std::min<int>(IndentAtLineBreak, StartOfLineColumn[i]);
  }
  DEBUG({
    for (size_t i = 0; i < Lines.size(); ++i) {
      llvm::dbgs() << i << " |" << Lines[i] << "| " << LeadingWhitespace[i]
                   << "\n";
    }
  });
}

void BreakableBlockComment::adjustWhitespace(const FormatStyle &Style,
                                             unsigned LineIndex,
                                             int IndentDelta) {
  // Calculate the end of the non-whitespace text in the previous line.
  size_t EndOfPreviousLine = Lines[LineIndex - 1].find_last_not_of(" \\\t");
  if (EndOfPreviousLine == StringRef::npos)
    EndOfPreviousLine = 0;
  else
    ++EndOfPreviousLine;
  // Calculate the start of the non-whitespace text in the current line.
  size_t StartOfLine = Lines[LineIndex].find_first_not_of(" \t");
  if (StartOfLine == StringRef::npos)
    StartOfLine = Lines[LineIndex].size();

  // Adjust Lines to only contain relevant text.
  Lines[LineIndex - 1] = Lines[LineIndex - 1].substr(0, EndOfPreviousLine);
  Lines[LineIndex] = Lines[LineIndex].substr(StartOfLine);
  // Adjust LeadingWhitespace to account all whitespace between the lines
  // to the current line.
  LeadingWhitespace[LineIndex] =
      Lines[LineIndex].begin() - Lines[LineIndex - 1].end();

  // FIXME: We currently count tabs as 1 character. To solve this, we need to
  // get the correct indentation width of the start of the comment, which
  // requires correct counting of the tab expansions before the comment, and
  // a configurable tab width. Since the current implementation only breaks
  // if leading tabs are intermixed with spaces, that is not a high priority.

  // Adjust the start column uniformly accross all lines.
  StartOfLineColumn[LineIndex] = std::max<int>(0, StartOfLine + IndentDelta);
}

unsigned BreakableBlockComment::getLineCount() const { return Lines.size(); }

unsigned
BreakableBlockComment::getLineLengthAfterSplit(unsigned LineIndex,
                                               unsigned TailOffset) const {
  return getContentStartColumn(LineIndex, TailOffset) +
         (Lines[LineIndex].size() - TailOffset) +
         // The last line gets a "*/" postfix.
         (LineIndex + 1 == Lines.size() ? 2 : 0);
}

BreakableToken::Split
BreakableBlockComment::getSplit(unsigned LineIndex, unsigned TailOffset,
                                unsigned ColumnLimit) const {
  return getCommentSplit(Lines[LineIndex].substr(TailOffset),
                         getContentStartColumn(LineIndex, TailOffset),
                         ColumnLimit);
}

void BreakableBlockComment::insertBreak(unsigned LineIndex, unsigned TailOffset,
                                        Split Split, bool InPPDirective,
                                        WhitespaceManager &Whitespaces) {
  StringRef Text = Lines[LineIndex].substr(TailOffset);
  StringRef Prefix = Decoration;
  if (LineIndex + 1 == Lines.size() &&
      Text.size() == Split.first + Split.second) {
    // For the last line we need to break before "*/", but not to add "* ".
    Prefix = "";
  }

  unsigned BreakOffsetInToken =
      Text.data() - Tok.TokenText.data() + Split.first;
  unsigned CharsToRemove = Split.second;
  assert(IndentAtLineBreak >= Decoration.size());
  Whitespaces.breakToken(Tok, BreakOffsetInToken, CharsToRemove, "", Prefix,
                         InPPDirective, IndentAtLineBreak - Decoration.size());
}

void
BreakableBlockComment::replaceWhitespaceBefore(unsigned LineIndex,
                                               unsigned InPPDirective,
                                               WhitespaceManager &Whitespaces) {
  if (LineIndex == 0)
    return;
  StringRef Prefix = Decoration;
  if (Lines[LineIndex].empty()) {
    if (LineIndex + 1 == Lines.size()) {
      // If the last line is empty, we don't need a prefix, as the */ will line
      // up with the decoration (if it exists).
      Prefix = "";
    } else if (!Decoration.empty()) {
      // For other empty lines, if we do have a decoration, adapt it to not
      // contain a trailing whitespace.
      Prefix = Prefix.substr(0, 1);
    }
  } else {
    if (StartOfLineColumn[LineIndex] == 1) {
      // This lines starts immediately after the decorating *.
      Prefix = Prefix.substr(0, 1);
    }
  }

  unsigned WhitespaceOffsetInToken =
      Lines[LineIndex].data() - Tok.TokenText.data() -
      LeadingWhitespace[LineIndex];
  assert(StartOfLineColumn[LineIndex] >= Prefix.size());
  Whitespaces.breakToken(
      Tok, WhitespaceOffsetInToken, LeadingWhitespace[LineIndex], "", Prefix,
      InPPDirective, StartOfLineColumn[LineIndex] - Prefix.size());
}

unsigned
BreakableBlockComment::getContentStartColumn(unsigned LineIndex,
                                             unsigned TailOffset) const {
  // If we break, we always break at the predefined indent.
  if (TailOffset != 0)
    return IndentAtLineBreak;
  return StartOfLineColumn[LineIndex];
}

} // namespace format
} // namespace clang