From 11df804e857c8e21d28dd142842d473e7ee77c32 Mon Sep 17 00:00:00 2001 From: dgelessus <dgelessus@users.noreply.github.com> Date: Fri, 10 Nov 2023 14:06:47 +0100 Subject: [PATCH] Fix minor Unicode newline handling bug and comment the logic --- .../java/org/sablecc/sablecc/lexer/Lexer.java | 22 +++++++++++++++---- .../resources/org/sablecc/sablecc/lexer.txt | 22 +++++++++++++++---- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/sablecc/sablecc/lexer/Lexer.java b/src/main/java/org/sablecc/sablecc/lexer/Lexer.java index 9449cde..30d4aa0 100644 --- a/src/main/java/org/sablecc/sablecc/lexer/Lexer.java +++ b/src/main/java/org/sablecc/sablecc/lexer/Lexer.java @@ -79,24 +79,38 @@ public class Lexer { switch(c) { - case 10: + case '\n': if(this.cr) { + // If the preceding character was \r (CR), + // ignore this \n (LF) character and don't increase the line or column. this.cr = false; } else { + // If there was no preceding \r (CR) character, + // consider this \n (LF) character an actual newline. this.line++; this.pos = 0; } break; - case 13: - case 8232: // Unicode line separator - case 8233: // Unicode paragraph separator + case '\r': + // A \r (CR) character is always considered a newline, + // but a \n (LF) character following it (if any) will be ignored (see above). this.line++; this.pos = 0; this.cr = true; break; + case 0x2028: // Unicode line separator + case 0x2029: // Unicode paragraph separator + // A Unicode line or paragraph separator is treated like a newline, + // but doesn't take part in the special handling for CR+LF. + // FIXME This case is a workaround for a limitation in the ProB cliparser prepl protocol, which doesn't support embedded newlines. + // TODO Remove this case once that is resolved. Practically nothing else uses the Unicode line separator character. + this.line++; + this.pos = 0; + this.cr = false; + break; default: this.pos++; this.cr = false; diff --git a/src/main/resources/org/sablecc/sablecc/lexer.txt b/src/main/resources/org/sablecc/sablecc/lexer.txt index 60f697e..506e87f 100644 --- a/src/main/resources/org/sablecc/sablecc/lexer.txt +++ b/src/main/resources/org/sablecc/sablecc/lexer.txt @@ -103,24 +103,38 @@ public class Lexer { switch(c) { - case 10: + case '\n': if(this.cr) { + // If the preceding character was \r (CR), + // ignore this \n (LF) character and don't increase the line or column. this.cr = false; } else { + // If there was no preceding \r (CR) character, + // consider this \n (LF) character an actual newline. this.line++; this.pos = 0; } break; - case 13: - case 8232: // Unicode line separator - case 8233: // Unicode paragraph separator + case '\r': + // A \r (CR) character is always considered a newline, + // but a \n (LF) character following it (if any) will be ignored (see above). this.line++; this.pos = 0; this.cr = true; break; + case 0x2028: // Unicode line separator + case 0x2029: // Unicode paragraph separator + // A Unicode line or paragraph separator is treated like a newline, + // but doesn't take part in the special handling for CR+LF. + // FIXME This case is a workaround for a limitation in the ProB cliparser prepl protocol, which doesn't support embedded newlines. + // TODO Remove this case once that is resolved. Practically nothing else uses the Unicode line separator character. + this.line++; + this.pos = 0; + this.cr = false; + break; default: this.pos++; this.cr = false; -- GitLab