From 11df804e857c8e21d28dd142842d473e7ee77c32 Mon Sep 17 00:00:00 2001
From: dgelessus <dgelessus@users.noreply.github.com>
Date: Fri, 10 Nov 2023 14:06:47 +0100
Subject: [PATCH] Fix minor Unicode newline handling bug and comment the logic

---
 .../java/org/sablecc/sablecc/lexer/Lexer.java | 22 +++++++++++++++----
 .../resources/org/sablecc/sablecc/lexer.txt   | 22 +++++++++++++++----
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/src/main/java/org/sablecc/sablecc/lexer/Lexer.java b/src/main/java/org/sablecc/sablecc/lexer/Lexer.java
index 9449cde..30d4aa0 100644
--- a/src/main/java/org/sablecc/sablecc/lexer/Lexer.java
+++ b/src/main/java/org/sablecc/sablecc/lexer/Lexer.java
@@ -79,24 +79,38 @@ public class Lexer
             {
                 switch(c)
                 {
-                    case 10:
+                    case '\n':
                         if(this.cr)
                         {
+                            // If the preceding character was \r (CR),
+                            // ignore this \n (LF) character and don't increase the line or column.
                             this.cr = false;
                         }
                         else
                         {
+                            // If there was no preceding \r (CR) character,
+                            // consider this \n (LF) character an actual newline.
                             this.line++;
                             this.pos = 0;
                         }
                         break;
-                    case 13:
-                    case 8232: // Unicode line separator
-                    case 8233: // Unicode paragraph separator
+                    case '\r':
+                        // A \r (CR) character is always considered a newline,
+                        // but a \n (LF) character following it (if any) will be ignored (see above).
                         this.line++;
                         this.pos = 0;
                         this.cr = true;
                         break;
+                    case 0x2028: // Unicode line separator
+                    case 0x2029: // Unicode paragraph separator
+                        // A Unicode line or paragraph separator is treated like a newline,
+                        // but doesn't take part in the special handling for CR+LF.
+                        // FIXME This case is a workaround for a limitation in the ProB cliparser prepl protocol, which doesn't support embedded newlines.
+                        // TODO Remove this case once that is resolved. Practically nothing else uses the Unicode line separator character.
+                        this.line++;
+                        this.pos = 0;
+                        this.cr = false;
+                        break;
                     default:
                         this.pos++;
                         this.cr = false;
diff --git a/src/main/resources/org/sablecc/sablecc/lexer.txt b/src/main/resources/org/sablecc/sablecc/lexer.txt
index 60f697e..506e87f 100644
--- a/src/main/resources/org/sablecc/sablecc/lexer.txt
+++ b/src/main/resources/org/sablecc/sablecc/lexer.txt
@@ -103,24 +103,38 @@ public class Lexer
             {
                 switch(c)
                 {
-                    case 10:
+                    case '\n':
                         if(this.cr)
                         {
+                            // If the preceding character was \r (CR),
+                            // ignore this \n (LF) character and don't increase the line or column.
                             this.cr = false;
                         }
                         else
                         {
+                            // If there was no preceding \r (CR) character,
+                            // consider this \n (LF) character an actual newline.
                             this.line++;
                             this.pos = 0;
                         }
                         break;
-                    case 13:
-                    case 8232: // Unicode line separator
-                    case 8233: // Unicode paragraph separator
+                    case '\r':
+                        // A \r (CR) character is always considered a newline,
+                        // but a \n (LF) character following it (if any) will be ignored (see above).
                         this.line++;
                         this.pos = 0;
                         this.cr = true;
                         break;
+                    case 0x2028: // Unicode line separator
+                    case 0x2029: // Unicode paragraph separator
+                        // A Unicode line or paragraph separator is treated like a newline,
+                        // but doesn't take part in the special handling for CR+LF.
+                        // FIXME This case is a workaround for a limitation in the ProB cliparser prepl protocol, which doesn't support embedded newlines.
+                        // TODO Remove this case once that is resolved. Practically nothing else uses the Unicode line separator character.
+                        this.line++;
+                        this.pos = 0;
+                        this.cr = false;
+                        break;
                     default:
                         this.pos++;
                         this.cr = false;
-- 
GitLab