From f76c576f61029d843e8b3261fb197b34c0437d23 Mon Sep 17 00:00:00 2001
From: Markus Alexander Kuppe <tlaplus.net@lemmster.de>
Date: Thu, 21 May 2015 08:24:31 +0200
Subject: [PATCH] [Feature] Add TLCIterator#getLast(long) where long acts as a
 lower bound for the sequential iteration through the (potentially large)
 in-memory buffer.

Starting with the second disk flush, the maximum value (fingerprint) of
the in-memory and on-disk file has to be calculated. The maximum on-disk
value is kept as a variable, while the in-memory value requires a lookup
by iterating the buffer end to front. The moment the iterator sees a
value smaller than the on-disk max value, it can stop its scan. It won't
find a value larger than the on-disk one.
---
 tlatools/src/tlc2/tool/fp/MSBDiskFPSet.java   | 81 ++++++++++++++++++-
 .../test/tlc2/tool/fp/MSBDiskFPSetTest2.java  | 33 ++++++++
 .../tool/fp/iterator/TLCIterator2Test.java    | 32 ++++++++
 .../tool/fp/iterator/TLCIteratorTest.java     | 12 +++
 4 files changed, 155 insertions(+), 3 deletions(-)
 create mode 100644 tlatools/test/tlc2/tool/fp/iterator/TLCIterator2Test.java

diff --git a/tlatools/src/tlc2/tool/fp/MSBDiskFPSet.java b/tlatools/src/tlc2/tool/fp/MSBDiskFPSet.java
index 3d724c0dd..6b8a63a4e 100644
--- a/tlatools/src/tlc2/tool/fp/MSBDiskFPSet.java
+++ b/tlatools/src/tlc2/tool/fp/MSBDiskFPSet.java
@@ -100,10 +100,18 @@ public class MSBDiskFPSet extends HeapBasedDiskFPSet {
 			final long buffLen = tblCnt.get();
 			final TLCIterator itr = new TLCIterator(tbl);
 
-			// Precompute the maximum value of the new file
-			long maxVal = itr.getLast();
+			// Precompute the maximum value of the new file.
+			// If this isn't the first merge, the index has
+			// the last element of the disk file. In that case
+			// the new maxVal is the larger element of the two
+			// in-memory and on-disk elements.
+			// The largest on-disk element is passed to the
+			// iterator as a lower bound.
+			long maxVal;
 			if (index != null) {
-				maxVal = Math.max(maxVal, index[index.length - 1]);
+				maxVal = itr.getLast(index[index.length - 1]);
+			} else {
+				maxVal = itr.getLast();
 			}
 	
 			int indexLen = calculateIndexLen(buffLen);
@@ -141,6 +149,7 @@ public class MSBDiskFPSet extends HeapBasedDiskFPSet {
 					if (value == fp) {
 						Assert.check(false, EC.TLC_FP_VALUE_ALREADY_ON_DISK,
 								String.valueOf(value));
+						
 					}
 					writeFP(outRAF, fp);
 					// we used one fp up, thus move to next one
@@ -276,11 +285,77 @@ public class MSBDiskFPSet extends HeapBasedDiskFPSet {
 			return result;
 		}
 
+		/**
+		 * @param lowBound
+		 *            Stop searching for the last element if no element is
+		 *            larger than lowBound
+		 * @return The last element in the iteration that is larger than lowBound
+		 * @exception NoSuchElementException
+		 *                if iteration is empty.
+		 *                <p>
+		 *                Pre-condition: Each bucket is sorted in ascending
+		 *                order. on-disk fingerprints don't adhere to the
+		 *                ascending order though!
+		 */
+		public long getLast(final long lowBound) {
+			int len = buff.length - 1;
+			long[] bucket = buff[len];
+
+			// Walk from the end of buff to the beginning. Each bucket that is
+			// found and non-null (null if no fingerprint for such an index has
+			// been added to the DiskFPSet) is checked for a valid fingerprint.
+			// A fp is valid iff it is larger zero. A zero fingerprint slot
+			// indicates an unoccupied slot, while a negative one corresponds to
+			// a fp that has already been flushed to disk.
+			while (len > 0) {
+				bucket = buff[--len];
+
+				// Find last element > 0 in bucket (negative elements have already
+				// been flushed to disk, zero indicates an unoccupied slot).
+				if (bucket != null) {
+					for (int i = bucket.length - 1; i >= 0; i--) {
+						final long fp = bucket[i];
+						// unused slot
+						if (fp == 0) {
+							continue;
+						}
+						// in-memory fingerprint
+						if (fp > 0) {
+							if (fp < lowBound) {
+								// smaller lowBound
+								return lowBound;
+							} else {
+								// larger or equal lowBound
+								return fp;
+							}
+						}
+						// Cannot take on-disk fingerprints (negative ones) into
+						// account. They don't adhere to the precondition that
+						// the bucket is sorted. The bucket is logically sorted
+						// only for in-memory fingerprints.
+					}
+				}
+			}
+			// At this point we have scanned the complete buff, but haven't
+			// found a single fingerprint that hasn't been flushed to disk
+			// already.
+			throw new NoSuchElementException();
+		}
+
 		/**
 		 * @return The last element in the iteration.
 	     * @exception NoSuchElementException if iteration is empty.
+	     * 
+	     * Pre-condition: Each bucket is sorted in ascending order!
 		 */
 		public long getLast() {
+			/*
+			 * Tempting to delegate to getLast(Long.MAX_VALUE). However,
+			 * getLast() could not throw a NoSuchElementException when the
+			 * value returned is Long.MAX_VALUE. It could be a valid value, but
+			 * it might as well be the cutOff.
+			 */
+			
 			int len = buff.length - 1;
 			long[] bucket = buff[len];
 
diff --git a/tlatools/test/tlc2/tool/fp/MSBDiskFPSetTest2.java b/tlatools/test/tlc2/tool/fp/MSBDiskFPSetTest2.java
index 3a8b45495..ff080010a 100644
--- a/tlatools/test/tlc2/tool/fp/MSBDiskFPSetTest2.java
+++ b/tlatools/test/tlc2/tool/fp/MSBDiskFPSetTest2.java
@@ -77,6 +77,39 @@ public class MSBDiskFPSetTest2 extends AbstractHeapBasedDiskFPSetTest {
 		}
 		fail();
 	}
+	
+	public void testGetLastWithCutOff() throws IOException {
+		final MSBDiskFPSet msbDiskFPSet = getMSBDiskFPSet();
+		
+		// Add the largest possible fingerprint into the fpset. It will end up
+		// in the largest bucket. Check that the MSB iterator returns it.
+		final long highFP = 1L << 62;
+		msbDiskFPSet.put(highFP);
+		TLCIterator tlcIterator = new MSBDiskFPSet.TLCIterator(msbDiskFPSet.tbl);
+		assertEquals(highFP, tlcIterator.getLast());
+		
+		// Flush the set to disk (simulating e.g. a checkpoint), a new iterator
+		// won't find the element anymore because it intentionally only searches
+		// for elements that are *not* on disk.
+		msbDiskFPSet.flusher.flushTable();
+		new MSBDiskFPSet.TLCIterator(msbDiskFPSet.tbl);
+		try {
+			tlcIterator.getLast();
+		} catch (NoSuchElementException e) {
+			// This exception is expected.
+			
+			// Now add the smallest possible element into the set. It will end
+			// up in the smallest bucket.
+			final long lowFP = 1;
+			msbDiskFPSet.put(lowFP);
+			// check that the iterator finds lower bound as the last element.
+			tlcIterator = new MSBDiskFPSet.TLCIterator(msbDiskFPSet.tbl);
+			final long lowerBound = highFP - 1L;
+			assertEquals(lowerBound, tlcIterator.getLast(lowerBound));
+			return;
+		}
+		fail();
+	}
 
 	private MSBDiskFPSet getMSBDiskFPSet() throws RemoteException, IOException {
 		// Create an MSBDiskFPSet usable in this unit test with memory allocated
diff --git a/tlatools/test/tlc2/tool/fp/iterator/TLCIterator2Test.java b/tlatools/test/tlc2/tool/fp/iterator/TLCIterator2Test.java
new file mode 100644
index 000000000..1ac9f4ed0
--- /dev/null
+++ b/tlatools/test/tlc2/tool/fp/iterator/TLCIterator2Test.java
@@ -0,0 +1,32 @@
+// Copyright (c) 2012 Microsoft Corporation.  All rights reserved.
+
+package tlc2.tool.fp.iterator;
+
+public class TLCIterator2Test extends TLCIteratorTest {
+
+	/* (non-Javadoc)
+	 * @see tlc2.tool.fp.iterator.TLCIteratorTest#getBuffer()
+	 */
+	@Override
+	protected long[][] getBuffer() {
+		final long[][] buff = new long[8][];
+		buff[0] = getArray(8, 1, 8);
+		buff[1] = getArray(8, 9, 6);
+		buff[2] = null;
+		buff[3] = null;
+		
+		buff[4] = getArray(8, 15, 7); // Bucket with last element
+		buff[5] = null;
+		
+		// Simulate that this bucket is filled with fingerprints who have all
+		// been flushed to disk.
+		buff[6] = new long[4];
+		buff[6][0] = 22L | 0x8000000000000000L;
+		buff[6][1] = 0L;
+		buff[6][2] = 0L;
+		buff[6][3] = 0L;
+		
+		buff[7] = null;
+		return buff;
+	}
+}
diff --git a/tlatools/test/tlc2/tool/fp/iterator/TLCIteratorTest.java b/tlatools/test/tlc2/tool/fp/iterator/TLCIteratorTest.java
index b15ac8788..07f347f36 100644
--- a/tlatools/test/tlc2/tool/fp/iterator/TLCIteratorTest.java
+++ b/tlatools/test/tlc2/tool/fp/iterator/TLCIteratorTest.java
@@ -94,4 +94,16 @@ public class TLCIteratorTest extends TestCase {
 	public void testGetLast() {
 		assertEquals(getLast(), itr.getLast());
 	}
+
+	public void testGetLastLowBound() {
+		assertEquals(getLast() + 1, itr.getLast(getLast() + 1));
+	}
+	
+	/*
+	 * Use a smaller lower bound element than last in the buffer.
+	 */
+	public void testGetLastLowerBoundNoHit() {
+		assertEquals(getLast(), itr.getLast(20));
+	}
+
 }
-- 
GitLab