in Public/Src/Cache/ContentStore/Hashing/Chunking/RegressionChunking.cs [261:1066]
private unsafe void FindRabinChunkBoundariesInternal(
byte* pStartBuffer, // Pointer to the BYTE buffer of data to be chunked in a sequence of FindRabinChunkBoundariesInternal() calls
size_t cbLen, // Length of the data to be chunked in a sequence of FindRabinChunkBoundariesInternal() calls
bool bNoMoreData, // If TRUE, this is the last call in the sequence of FindRabinChunkBoundariesInternal() calls on this data
ref size_t previouslyProcessedBytesParam, // Temporary state between calls FindRabinChunkBoundariesInternal(). Amount of previously processed bytes since the last recorded chunk
ref size_t lastChunkAbsoluteOffsetParam // Temporary state between calls FindRabinChunkBoundariesInternal(). Offset of the last inserted chunk, relative to the overall buffer in FindRabinChunkBoundaries()
)
{
HashValueT[] g_arrPolynomialsTD = Rabin64Table.g_arrPolynomialsTD;
HashValueT[] g_arrPolynomialsTU = Rabin64Table.g_arrPolynomialsTU;
unchecked
{
NT_ASSERT(cbLen > 0);
NT_ASSERT(pStartBuffer != null);
NT_ASSERT(previouslyProcessedBytesParam <= m_nMaxChunkSize);
//
// Buffer validation support
//
// Used to define the end of buffer (the first byte beyond the addressable pStartBuffer)
// Used only for DDP_ASSERT_VALID_XXX asserts
byte* pEndBuffer = pStartBuffer + cbLen;
NT_ASSERT(pStartBuffer < pEndBuffer); // Ensure that we don't have arithmetic overrun
// Using local functions instead of delegates, because the local functions are slightly more efficient
// because they can avoid creating a closure at runtime.
void DDP_ASSERT_VALID_BUFFER_POINTER(byte* pTestedPointer)
{
unchecked
{
DDP_BUFFER_RANGE_ASSERT(pTestedPointer, pStartBuffer, pEndBuffer);
}
}
void DDP_ASSERT_VALID_BUFFER_END(byte* pTestedPointer)
{
unchecked
{
DDP_BUFFER_RANGE_ASSERT((byte*)pTestedPointer, pStartBuffer + 1, pEndBuffer + 1);
}
}
void DDP_ASSERT_VALID_START_POINTER(byte* pTestedPointer)
{
unchecked
{
DDP_BUFFER_RANGE_ASSERT((byte*)pTestedPointer + m_nWindowSize, pStartBuffer, pEndBuffer + 1);
}
}
void DDP_ASSERT_VALID_END_POINTER(byte* pTestedPointer)
{
unchecked
{
DDP_BUFFER_RANGE_ASSERT((byte*)pTestedPointer, pStartBuffer, pEndBuffer + 1);
}
}
//bool DDP_IS_VALID_POINTER(IntPtr pTestedPointer) =>
// unchecked((((byte*)pTestedPointer >= pStartBuffer) && ((byte*)pTestedPointer < pEndBuffer)));
//
// Local state variables (for this call)
//
// During chunking, this is the index where we start the "look-ahead". This offset is relative to the beginning of the current buffer
// Note: this value should always be positive
OffsetT startOffset = 0;
// Amount of bytes available in the buffer for chunking/analysis from startOffset to the end of the buffer. Equal or smaller than the size of the buffer
// Note: this value should be always positive
OffsetT remainingBytes = (OffsetT)cbLen;
// Amount of previously processed bytes (since we recorded the last chunk) until the startOffset. Represents bytes from the previous call, in the initial chunking iteration
size_t previouslyProcessedBytes = previouslyProcessedBytesParam;
// Offset of the last inserted chunk relative to the file stream (i.e. to the beginning of the virtual buffer spanning the sequence of calls)
size_t lastChunkAbsoluteOffset = lastChunkAbsoluteOffsetParam;
// Holds the currently computed Rabin hash value over the current window. At the end of thus call, we will move this value back to m_hash
HashValueT hash = m_hash;
/*
// Per-instance state members (passed across calls)
LONGLONG m_numZeroRun - // Size of continuous zeros after last chunk up to the current offset.
// >= 0, a run is counted (# of consecutive zeros)
// < 0, a run has been interrupted, i.e., encounter at least one none zero values.
HashValueT m_hash; // Last hash value
BYTE m_history[m_nWindowSize]; // Last bytes from the previous chunk. Used to reinitialize the state machine for chunking (size = window size)
OffsetT m_regressChunkLen[m_nRegressSize];
OffsetT m_lastNonZeroChunkLen;
TODO:365262 - simplify the code by allocating a larger buffer and copying the data to ensure a continuous operating buffer
*/
HashMaskMatch smallestMask = _smallestMask; // If the least significant N bytes from the Rabin hash are equal to this value, we declare a "cut" (N depends on the context)
HashMaskMatch[] regressionMasks = _regressionMasks; // Same as above, but for each regression step.
//
// Chunking loop
//
while (remainingBytes > 0)
{
NT_ASSERT(startOffset >= 0);
//
// Check to see if the available bytes (remaining + previous) are insufficient to create ore "real" (i.e. non-minimum) chunks
//
// If the remaining bytes plus the previous "partial chunk" are less than the minimum chunk size, wrap up and exit
size_t remainingBytesToBeReported = (size_t)(remainingBytes) + previouslyProcessedBytes;
NT_ASSERT(remainingBytesToBeReported > 0);
if (remainingBytesToBeReported < m_nMinChunkSize)
{
// If we had a zero run previously, check to see if all the remaining bytes are all zeros
// If yes, m_numZeroRun will be the size of the last "zero chunk"
if (m_numZeroRun >= 0)
{
NT_ASSERT(m_numZeroRun == (OffsetT)(previouslyProcessedBytes));
byte* pStartZeroTest = pStartBuffer + startOffset;
DDP_ASSERT_VALID_BUFFER_POINTER(pStartZeroTest);
// Check how many subsequent consecutive bytes are zeros
OffsetT remainingNonZero = remainingBytes;
// TODO:365262 - move this in a C++ utility routine
// Note: we used DDP_IS_VALID_POINTER to "hide" OACR failures as Prefast can't keep up with the large list of asumptions (known bug)
while (DDP_IS_VALID_POINTER(pStartZeroTest, pStartBuffer, pEndBuffer) && (remainingNonZero > 0) && ((*pStartZeroTest) == 0))
{
remainingNonZero--;
pStartZeroTest++;
}
// Check if we found a non-zero byte
if (remainingNonZero > 0)
{
// A non-zero byte was encountered
m_numZeroRun = -1;
}
else
{
NT_ASSERT(pStartZeroTest <= pEndBuffer);
// All remaining bytes were zeros
m_numZeroRun += remainingBytes;
NT_ASSERT(m_numZeroRun == (OffsetT)(remainingBytesToBeReported));
}
}
// Check if this is the last chunk in the last call sequence
if (bNoMoreData)
{
// Add the final chunk, as its size is smaller than the minimum chunk size
// TODO:365262 - use here DDP_CCT_MinReached always (technically this is a MinReached) plus mix DDP_CCT_MinReached and DDP_CCT_All_Zero as flags
// TODO:365262 - use AddChunk
AddChunkInfo(
new DedupBasicChunkInfo(
lastChunkAbsoluteOffset,
remainingBytesToBeReported,
(m_numZeroRun >= 0) ? DedupChunkCutType.DDP_CCT_All_Zero : DedupChunkCutType.DDP_CCT_MinReached));
//
// Reset all state related with the cross-call sequence
//
// TODO:365262 - move the state reset in a utility
// TODO:365262 - reset other members such as m_numZeroRun, m_regressChunk, etc
hash = 0;
previouslyProcessedBytes = 0;
lastChunkAbsoluteOffset += remainingBytesToBeReported;
m_numZeroRun = 0;
// TODO:365262 - assert that we reached the end of the buffer
// TODO:365262 - cleanup alg (visible exit here for code clarity)
}
else
{
// This is a "partial" chunk - the remainder will be processed in the next call
previouslyProcessedBytes += (size_t)(remainingBytes);
}
// Add remainingBytes to the processed data
startOffset += remainingBytes;
remainingBytes = 0;
// Nothing left to process in this call. Exit the loop
break;
}
//
// Given the treatment above, available bytes (remaining + previous) is at least MinChunkSize. Chunking can now proceed
//
NT_ASSERT(remainingBytesToBeReported >= m_nMinChunkSize);
// Calculate the amount of bytes that could be skipped (since we can skip in the hash calculation the first m_nMinChunkSize bytes)
OffsetT bytesToBeSkipped = (OffsetT)(m_nMinChunkSize) - (OffsetT)(previouslyProcessedBytes);
NT_ASSERT(remainingBytes >= bytesToBeSkipped); // should be always true given the "if" test above
//
// Calculate start window
//
OffsetT initialStartWindow = 0; // Beginning of the byte window for Rabin hash calculation. This offset is relative to the beginning of the buffer
// Check if we need to perform a "jump" in the data since, if we just made a chunk cut earlier, we can skip the current offset beyond the m_nMinChunkSize since the last cut
// Note - since both values are either zero or positive, this test essentially checks if at least one of the values is non-zero
if (startOffset + bytesToBeSkipped > 0)
{
// End of the byte window for Rabin hash calculation. This offset is relative to the beginning of the buffer
OffsetT initialEndWindow = startOffset + bytesToBeSkipped;
// Note: this can end up slightly negative (but within the window size). This is OK.
initialStartWindow = initialEndWindow - (OffsetT)(m_nWindowSize);
// Add zero run detection up to the end of the window, including bytes to be skipped
if (m_numZeroRun >= 0)
{
// Scan till end window, is it all zeros?
byte* pStartZeroTest = pStartBuffer + startOffset;
DDP_ASSERT_VALID_BUFFER_POINTER(pStartZeroTest);
byte* pEndPosZeroTest = pStartBuffer + initialEndWindow;
DDP_ASSERT_VALID_BUFFER_END(pEndPosZeroTest);
// Note: we used DDP_IS_VALID_POINTER to "hide" OACR failures as Prefast can't keep up with the large list of asumptions (known bug)
while (DDP_IS_VALID_POINTER(pStartZeroTest, pStartBuffer, pEndBuffer) && (pStartZeroTest != pEndPosZeroTest) && ((*pStartZeroTest) == 0))
{
pStartZeroTest++;
}
// Note: here m_numZeroRun can go beyond previouslyProcessedBytes
if (pStartZeroTest != pEndPosZeroTest)
{
DDP_ASSERT_VALID_BUFFER_POINTER(pStartZeroTest);
m_numZeroRun = -1;
}
else
{
NT_ASSERT(initialEndWindow >= startOffset);
m_numZeroRun += initialEndWindow - (OffsetT)(startOffset);
}
}
// We have to make a jump so the previous hash context is lost. Recalculate the hash starting from the new position
hash = 0;
// Start the hash calculation from the history if the beginning of the window falls outside the current buffer
// This could happen if m_nMinChunkSize is just a few bytes below previouslyProcessedBytes, and the content of these bytes were in the previous buffer
OffsetT currentStartIndex = initialStartWindow;
for (; currentStartIndex < 0; currentStartIndex++)
{
HashValueT origHash = hash;
hash <<= 8;
OffsetT nHistoryIndex = currentStartIndex + (long)m_nWindowSize;
DDP_ASSERT_VALID_ARRAY_INDEX(nHistoryIndex, m_history);
hash ^= m_history[nHistoryIndex];
hash ^= g_arrPolynomialsTD[(origHash >> 56) & 0xff];
}
// Perform the Rabin hash calculation on the remaining bytes in the window
NT_ASSERT(currentStartIndex >= 0);
byte* pbMark = pStartBuffer + currentStartIndex;
DDP_ASSERT_VALID_BUFFER_POINTER(pbMark);
// Compute the hash for the remaining bytes (within the window) in the actual buffer
for (; DDP_IS_VALID_POINTER(pbMark, pStartBuffer, pEndBuffer) && (currentStartIndex < initialEndWindow) && (pbMark < pEndBuffer); currentStartIndex++)
{
HashValueT origHash = hash;
hash <<= 8;
hash ^= *pbMark;
hash ^= g_arrPolynomialsTD[(origHash >> 56) & 0xff];
pbMark++;
}
// Reset m_regressChunkLen array as we just did a jump
// TODO:365262 - move this to a utility routine (note: also used in the constructor)
for (size_t regressIndex = 0; regressIndex < m_nRegressSize; regressIndex++)
{
DDP_ASSERT_VALID_ARRAY_INDEX(regressIndex, m_regressChunkLen);
m_regressChunkLen[regressIndex] = -1;
}
m_lastNonZeroChunkLen = -1;
}
else
{
initialStartWindow = -(OffsetT)(m_nWindowSize);
}
//
// Get pointers to the beginning of the window start/end
//
// Pointer to the start of the window
byte* pStartWindow = pStartBuffer + initialStartWindow;
DDP_ASSERT_VALID_START_POINTER(pStartWindow);
// Pointer to the end of the window
byte* pEndWindow = pStartWindow + m_nWindowSize;
DDP_ASSERT_VALID_END_POINTER(pEndWindow);
// Pointer to the byte where the maximum chunk size is hit (or to the end of the current buffer, if the max is beyond reach)
NT_ASSERT(m_nMaxChunkSize > previouslyProcessedBytes);
OffsetT bytesUntilMax = (OffsetT)(m_nMaxChunkSize) - (OffsetT)(previouslyProcessedBytes);
byte* pEndPosUntilMax = pStartBuffer + Math.Min((OffsetT)cbLen, startOffset + bytesUntilMax);
DDP_ASSERT_VALID_BUFFER_END(pEndPosUntilMax);
// Continue the zero run detection until pEndPosUntilMax
if (m_numZeroRun >= 0)
{
// Note: m_numZeroRun can be larger if we performed a jump (due to zero detection during the jump)
NT_ASSERT(m_numZeroRun >= (OffsetT)(previouslyProcessedBytes));
bool bDeclareChunk = false;
// Find the first non-zero
// TODO:365262 consider use utility routine
// Note: we used DDP_IS_VALID_POINTER to "hide" OACR failures as Prefast can't keep up with the large list of asumptions (known bug)
byte* pPreviousEndWindow = pEndWindow;
while (DDP_IS_VALID_POINTER(pEndWindow, pStartBuffer, pEndBuffer) && (pEndWindow != pEndPosUntilMax) && ((*pEndWindow) == 0))
{
pEndWindow++;
}
DDP_ASSERT_VALID_END_POINTER(pEndWindow);
// Get the amount of zero bytes just discovered, and update m_numZeroRun
NT_ASSERT(pEndWindow >= pPreviousEndWindow);
size_t zeroScanned = (size_t)(pEndWindow - pPreviousEndWindow);
m_numZeroRun += (OffsetT)zeroScanned;
// Update the number of processed bytes
// This includes the bytes discovered above, and the zero-run discovered by "jumping"
OffsetT zeroBytes = m_numZeroRun - (OffsetT)(previouslyProcessedBytes);
// Check if we need to record a new chunk
if (pEndWindow == pEndPosUntilMax)
{
// All zeros in this run
// Check if we reached the end of the buffer without reaching the maximum chunk size
if (m_numZeroRun < (OffsetT)(m_nMaxChunkSize))
{
// We reached the end of the buffer
NT_ASSERT(pEndPosUntilMax == (pStartBuffer + cbLen));
if (bNoMoreData)
{
NT_ASSERT(m_numZeroRun > 0);
bDeclareChunk = true;
}
else
{
// We need to exit as we are at the end of the buffer
}
}
else
{
NT_ASSERT(m_numZeroRun >= (OffsetT)(m_nMaxChunkSize));
bDeclareChunk = true;
}
}
else
{
DDP_ASSERT_VALID_END_POINTER(pEndWindow);
NT_ASSERT(m_numZeroRun >= (OffsetT)(m_nMinChunkSize));
bDeclareChunk = true;
}
if (bDeclareChunk)
{
// TODO:365262 - use AddChunk
NT_ASSERT(m_numZeroRun > 0);
AddChunkInfo(
new DedupBasicChunkInfo(
lastChunkAbsoluteOffset,
(size_t)(m_numZeroRun),
DedupChunkCutType.DDP_CCT_All_Zero));
hash = 0;
previouslyProcessedBytes = 0;
lastChunkAbsoluteOffset += (size_t)(m_numZeroRun);
m_numZeroRun = 0;
startOffset += zeroBytes;
remainingBytes -= zeroBytes;
NT_ASSERT(remainingBytes >= 0);
continue;
}
else
{
// TODO:365262 - exit here from the routine
// No chunk cut yet as we reached the end of the buffer while counting zeros
// Update state to incorporate the zeros we just found
startOffset += zeroBytes;
remainingBytes -= zeroBytes;
NT_ASSERT(remainingBytes == 0); // We need to exit as we can't declare a chunk yet
NT_ASSERT(m_numZeroRun >= 0);
previouslyProcessedBytes = (size_t)(m_numZeroRun);
continue;
}
}
//
// We are done with zero detection.
// Perform the actual hasing + chunking.
//
bool bLoopForNextHashValue = true;
while (bLoopForNextHashValue)
{
NT_ASSERT(pStartWindow < pEndWindow);
// Advance hash calculation when the start is from the previous buffer, and the end is in the current buffer
// TODO:365262 potential perf improvement
while (!smallestMask.IsMatch(hash) &&
pEndWindow < pEndPosUntilMax &&
initialStartWindow < 0)
{
// use history
// TODO:365262 - add index check for g_arrPolynomialsXX (static assert)
DDP_ASSERT_VALID_ARRAY_INDEX(initialStartWindow + (OffsetT)m_nWindowSize, m_history);
hash ^= g_arrPolynomialsTU[m_history[initialStartWindow + (OffsetT)m_nWindowSize]];
HashValueT origHash = hash;
hash <<= 8;
hash ^= *pEndWindow;
hash ^= g_arrPolynomialsTD[(origHash >> 56) & 0xff];
pStartWindow++;
pEndWindow++;
initialStartWindow++;
}
DDP_ASSERT_VALID_START_POINTER(pStartWindow);
DDP_ASSERT_VALID_END_POINTER(pEndWindow);
// Advance calculation while both window ends are in the same buffer
// TODO:365262 potential perf improvement
while (pEndWindow < pEndPosUntilMax && !smallestMask.IsMatch(hash))
{
// the main critical loop
hash ^= g_arrPolynomialsTU[*pStartWindow];
HashValueT origHash = hash;
hash <<= 8;
hash ^= *pEndWindow;
hash ^= g_arrPolynomialsTD[origHash >> 56];
pStartWindow++;
pEndWindow++;
// Note: we do need not to increment initialStartWindow anymore here (as this was related with the initial setup)
}
DDP_ASSERT_VALID_START_POINTER(pStartWindow);
DDP_ASSERT_VALID_END_POINTER(pEndWindow);
// Processed bytes starting from the current startOffset. Equal or smaller than the size of the buffer
NT_ASSERT(pEndWindow >= pStartBuffer + startOffset);
size_t processedBytes = (size_t)(pEndWindow - (pStartBuffer + startOffset));
// Length of the "potential chunk" we found
size_t chunkLen = processedBytes + previouslyProcessedBytes;
NT_ASSERT(chunkLen <= m_nMaxChunkSize);
NT_ASSERT(chunkLen >= m_nMinChunkSize);
// Check if a hash-driven chunk cut was made (using the smallest hash/mask)
if (smallestMask.IsMatch(hash))
{
// TODO:365262 use a utility routine
OffsetT regressHashMismatchIndex = (OffsetT)m_nRegressSize - 1;
for (; regressHashMismatchIndex >= 0; regressHashMismatchIndex--)
{
// Find the last mask close to m_nMaxChunkSize
// TODO:365262 array index check
// TODO:365262 Refactor to eliminate the confusing "offset by 1" difference between the two arrays
m_regressChunkLen[regressHashMismatchIndex] = (OffsetT)chunkLen;
if (!regressionMasks[regressHashMismatchIndex].IsMatch(hash))
break;
}
// If we had a match all the way it means that we encountered a match with the full-lenght hash value
if (regressHashMismatchIndex < 0)
{
// Find full mask match
// TODO:365262 - use AddChunk
AddChunkInfo(
new DedupBasicChunkInfo(lastChunkAbsoluteOffset, chunkLen, DedupChunkCutType.DDP_CCT_Normal));
hash = 0;
previouslyProcessedBytes = 0;
m_numZeroRun = 0;
lastChunkAbsoluteOffset += chunkLen;
bLoopForNextHashValue = false;
startOffset += (OffsetT)processedBytes;
remainingBytes -= (OffsetT)processedBytes;
break;
}
else
{
// Not a full mask match, repeat the logic in the main critical loop,
// We will need to move the pointer forward by one byte, otherwise, it will stuck in the loop
if (pEndWindow < pEndPosUntilMax)
{
if (initialStartWindow < 0)
{
DDP_ASSERT_VALID_ARRAY_INDEX(initialStartWindow + (OffsetT)m_nWindowSize, m_history);
hash ^= g_arrPolynomialsTU[m_history[initialStartWindow + (OffsetT)m_nWindowSize]];
}
else if (DDP_IS_VALID_POINTER(pStartWindow, pStartBuffer, pEndBuffer))
{
hash ^= g_arrPolynomialsTU[*pStartWindow];
}
else
{
NT_ASSERT(false);
}
if (DDP_IS_VALID_POINTER(pEndWindow, pStartBuffer, pEndBuffer))
{
HashValueT origHash = hash;
hash <<= 8;
hash ^= *pEndWindow;
hash ^= g_arrPolynomialsTD[(origHash >> 56) & 0xff];
}
else
{
NT_ASSERT(false);
}
pStartWindow++;
DDP_ASSERT_VALID_START_POINTER(pStartWindow);
pEndWindow++;
DDP_ASSERT_VALID_END_POINTER(pEndWindow);
initialStartWindow++;
continue; // To loop for next hash value
}
else
{
// We found a mismatch on a larger chunk but we also reach the end of the chunk. We need to continue with regression
NT_ASSERT(pEndWindow == pEndPosUntilMax);
}
}
};
// We reach pEndWindow == pEndPosUntilMax condition.
NT_ASSERT(pEndWindow == pEndPosUntilMax);
// Perform regression
if (m_nMaxChunkSize == chunkLen)
{
// Run to m_nMaxChunkSize
size_t curChunkLen = chunkLen;
DedupChunkCutType cutType = DedupChunkCutType.DDP_CCT_Unknown;
size_t lowestValidRegressionIndex;
for (lowestValidRegressionIndex = 0; lowestValidRegressionIndex < m_nRegressSize; lowestValidRegressionIndex++)
{
if (m_regressChunkLen[lowestValidRegressionIndex] >= 0)
break;
}
size_t potentiallyRegressedChunkLen = chunkLen;
// If we found a length we can regress to, get the chunk length and type
// Note: this should work if it regresses in Previous Bytes
if (lowestValidRegressionIndex < m_nRegressSize)
{
// We find at least one chunk point with a partial mask match
// TODO:365262 - add better type inference that works with a variable-length regression array
switch (lowestValidRegressionIndex)
{
case 0: cutType = DedupChunkCutType.DDP_CCT_Regress_1_bit; break;
case 1: cutType = DedupChunkCutType.DDP_CCT_Regress_2_bit; break;
case 2: cutType = DedupChunkCutType.DDP_CCT_Regress_3_bit; break;
case 3: cutType = DedupChunkCutType.DDP_CCT_Regress_4_bit; break;
default:
NT_ASSERT(false);
break;
}
potentiallyRegressedChunkLen = (size_t)(m_regressChunkLen[lowestValidRegressionIndex]);
m_regressChunkLen[lowestValidRegressionIndex] = -1;
// Adjust the length of subsequent regressions
size_t subsequentRegressionIndexes = lowestValidRegressionIndex + 1;
for (; subsequentRegressionIndexes < m_nRegressSize; subsequentRegressionIndexes++)
{
// All regress point match less bit is adjusted here.
NT_ASSERT(m_regressChunkLen[subsequentRegressionIndexes] >= (OffsetT)(potentiallyRegressedChunkLen));
m_regressChunkLen[subsequentRegressionIndexes] -= (OffsetT)potentiallyRegressedChunkLen;
if (m_regressChunkLen[subsequentRegressionIndexes] < (OffsetT)(m_nMinChunkSize))
{
m_regressChunkLen[subsequentRegressionIndexes] = -1;
}
}
// TODO:365262 - clean up the algorithm
if (m_lastNonZeroChunkLen != -1)
{
if (m_lastNonZeroChunkLen >= (OffsetT)(potentiallyRegressedChunkLen))
{
m_lastNonZeroChunkLen -= (OffsetT)potentiallyRegressedChunkLen;
if (m_lastNonZeroChunkLen < (OffsetT)(m_nMinChunkSize))
m_lastNonZeroChunkLen = -1;
}
else
{
m_lastNonZeroChunkLen = -1;
}
}
}
else
{
cutType = DedupChunkCutType.DDP_CCT_MaxReached;
}
NT_ASSERT(DedupChunkCutType.DDP_CCT_Unknown != cutType);
// TODO:365262 - use AddChunk
AddChunkInfo(
new DedupBasicChunkInfo(lastChunkAbsoluteOffset, potentiallyRegressedChunkLen, cutType));
// Note: processedBytes is still relative to curChunkLen not potentiallyRegressedChunkLen
// If this is the last call in teh dcall sequence, one more chunk will be added at the end
previouslyProcessedBytes = (size_t)((OffsetT)(curChunkLen) - (OffsetT)(potentiallyRegressedChunkLen));
lastChunkAbsoluteOffset += potentiallyRegressedChunkLen;
startOffset += (OffsetT)processedBytes;
remainingBytes -= (OffsetT)processedBytes;
// Recalculate m_numZeroRun
// TODO:365262 cleanup algorithm
if (previouslyProcessedBytes >= m_nWindowSize)
{
m_numZeroRun = -1; // This can't be part of a zero run, otherwise it will match the full 16 bit mask.
}
else
{
OffsetT nExaminePos = startOffset - 1;
size_t prevBytesAdj = previouslyProcessedBytes;
for (; prevBytesAdj > 0; prevBytesAdj--, nExaminePos--)
{
if (nExaminePos >= 0)
{
DDP_ASSERT_VALID_BUFFER_POINTER((pStartBuffer + nExaminePos));
if (pStartBuffer[nExaminePos] != 0)
break;
}
else
{
DDP_ASSERT_VALID_ARRAY_INDEX(nExaminePos + (OffsetT)m_nWindowSize, m_history);
if (m_history[nExaminePos + (OffsetT)m_nWindowSize] != 0)
break;
}
}
if (prevBytesAdj > 0)
m_numZeroRun = -1;
else
m_numZeroRun = (OffsetT)previouslyProcessedBytes;
}
if (previouslyProcessedBytes >= m_nMinChunkSize)
{
// Loop can continue runs.
bLoopForNextHashValue = true;
// Update pEndPosUntilMax
// TODO:365262 - cleanup algorithm
pEndPosUntilMax = pStartBuffer + Math.Min((OffsetT)cbLen, startOffset + (OffsetT)(m_nMaxChunkSize) - (OffsetT)(previouslyProcessedBytes));
DDP_ASSERT_VALID_BUFFER_END(pEndPosUntilMax);
}
else
{
// Need to exit loop
bLoopForNextHashValue = false;
}
}
else if (bNoMoreData)
{
// We can't apply regression as we are reaching the end
NT_ASSERT(chunkLen > 0);
// TODO:365262 - use AddChunk
// TODO:MSR - we should declare the "end" consistently (right now we use both MinReached and EndReached)
AddChunkInfo(
new DedupBasicChunkInfo(lastChunkAbsoluteOffset, chunkLen, DedupChunkCutType.DDP_CCT_EndReached));
hash = 0;
previouslyProcessedBytes = 0;
m_numZeroRun = 0;
lastChunkAbsoluteOffset += chunkLen;
startOffset += (OffsetT)processedBytes;
remainingBytes -= (OffsetT)processedBytes;
bLoopForNextHashValue = false;
// TODO:365262 - add a more visible exit
}
else // more data but no regression
{
// Find first non zero position, make sure we don't run before pStartBuffer
byte* pReverseScan = pEndWindow;
DDP_ASSERT_VALID_END_POINTER(pReverseScan);
byte* pReverseScanStop = pEndPosUntilMax - Math.Min((OffsetT)(chunkLen) - (OffsetT)(m_nMinChunkSize) + 1, pEndPosUntilMax - pStartBuffer);
DDP_ASSERT_VALID_END_POINTER(pReverseScanStop);
do
{
pReverseScan--;
// DDP_ASSERT_VALID_BUFFER_POINTER((IntPtr)pReverseScan);
}
while (DDP_IS_VALID_POINTER(pReverseScan, pStartBuffer, pEndBuffer) && ((*pReverseScan) == 0) && (pReverseScan >= pReverseScanStop));
// Note: we used DDP_IS_VALID_POINTER to "hide" OACR failures as Prefast can't keep up with the large list of asumptions (known bug)
if (pReverseScan >= pReverseScanStop)
{
// A first non zero position is found, we chunk at the first zero position afterwards.
m_lastNonZeroChunkLen = (OffsetT)(chunkLen) - (pEndPosUntilMax - pReverseScan - 1);
NT_ASSERT(m_lastNonZeroChunkLen >= (OffsetT)(m_nMinChunkSize));
NT_ASSERT(m_lastNonZeroChunkLen <= (OffsetT)(m_nMaxChunkSize));
};
previouslyProcessedBytes += processedBytes;
startOffset += (OffsetT)processedBytes;
remainingBytes -= (OffsetT)processedBytes;
bLoopForNextHashValue = false;
}
}
}
NT_ASSERT(remainingBytes >= 0);
// Add one last chunk, if this is the last call and the last regression run left some "left-over" buffer
if ((previouslyProcessedBytes > 0) && bNoMoreData)
{
// End reached, no more data available
DedupChunkCutType endChunkType = (m_numZeroRun >= 0)
? DedupChunkCutType.DDP_CCT_All_Zero
: (previouslyProcessedBytes < m_nMinChunkSize)
? DedupChunkCutType.DDP_CCT_MinReached
: DedupChunkCutType.DDP_CCT_EndReached;
AddChunkInfo(new DedupBasicChunkInfo(lastChunkAbsoluteOffset, previouslyProcessedBytes, endChunkType));
hash = 0;
lastChunkAbsoluteOffset += previouslyProcessedBytes;
previouslyProcessedBytes = 0;
m_numZeroRun = 0;
}
//
// Save internal state for the next CDedupRegressionChunking::FindRabinChunkBoundariesInternal call
//
OffsetT bytesIndex = 0;
for (; bytesIndex < Math.Min(0, (OffsetT)(m_nWindowSize) - (OffsetT)(cbLen)); bytesIndex++)
{
DDP_ASSERT_VALID_ARRAY_INDEX(bytesIndex, m_history);
DDP_ASSERT_VALID_ARRAY_INDEX(bytesIndex + (OffsetT)cbLen, m_history);
m_history[bytesIndex] = m_history[bytesIndex + (OffsetT)cbLen];
}
for (; bytesIndex < (OffsetT)m_nWindowSize; bytesIndex++)
{
DDP_ASSERT_VALID_BUFFER_POINTER((pStartBuffer + cbLen - m_nWindowSize + bytesIndex));
DDP_ASSERT_VALID_ARRAY_INDEX(bytesIndex, m_history);
m_history[bytesIndex] = pStartBuffer[(OffsetT)(cbLen) - (OffsetT)(m_nWindowSize) + bytesIndex];
}
m_hash = hash;
// Save the amount of unprocessed bytes
previouslyProcessedBytesParam = previouslyProcessedBytes;
// Save the last absolute chunk offset
NT_ASSERT(lastChunkAbsoluteOffset >= lastChunkAbsoluteOffsetParam);
lastChunkAbsoluteOffsetParam = lastChunkAbsoluteOffset;
}
}