in DuplicateCodeDetector/CloneDetector.cs [74:104]
/// <summary>
/// Reads a gzip-compressed file of line-delimited JSON objects and builds a
/// per-project index mapping each entry's identifier to a sparse vector of
/// its token counts. The new index is registered in <c>_index</c> under
/// <paramref name="parsedJsonlPath"/>.
/// </summary>
/// <remarks>
/// Each line is deserialized into a dictionary. The tokens are read from the
/// field named by <c>_tokensFieldName</c> and counted via <c>Count</c>.
/// Entries whose total token count is below
/// <c>MIN_NUM_TOKENS_FOR_FILE</c> are not indexed. The entry identifier is
/// the values of <c>_identifyingFields</c> joined with <c>":"</c>; a later
/// entry with the same identifier overwrites an earlier one.
/// Blank lines and lines that deserialize to <c>null</c> are skipped.
/// NOTE(review): the index is registered before the file is opened, so a
/// failed read presumably leaves an empty or partial index under this path —
/// confirm whether callers rely on that.
/// </remarks>
/// <param name="parsedJsonlPath">
/// Path of the gzip-compressed JSONL file to index; also used as the key
/// under which the resulting index is stored.
/// </param>
public void BuildIndexForProject(string parsedJsonlPath)
{
    var projectIndex = new Dictionary<string, SparseVector>();
    _index.Add(parsedJsonlPath, projectIndex);

    // Open read-only: FileStream(path, FileMode.Open) would request
    // read/write access and fail on read-only files even though the
    // file is never written to here.
    using (var stream = new FileStream(parsedJsonlPath, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (var uncompressed = new GZipStream(stream, CompressionMode.Decompress))
    using (var text = new StreamReader(uncompressed))
    {
        string line;
        while ((line = text.ReadLine()) != null)
        {
            // Skip placeholder "null" records and blank lines (e.g. a trailing
            // empty line), which would otherwise deserialize to null.
            if (line == "null" || string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            var tokenData = JsonConvert.DeserializeObject<IDictionary<string, object>>(line);
            if (tokenData == null)
            {
                // e.g. a "null" literal surrounded by whitespace.
                continue;
            }

            var tokenCounter = Count(((JArray)tokenData[_tokensFieldName]).Select(t => t.ToString()));
            if (tokenCounter.Sum(tc => tc.Count) < MIN_NUM_TOKENS_FOR_FILE)
            {
                // Too few tokens to be indexed.
                continue;
            }

            // Map each distinct token to its dictionary id, paired with its count.
            var spVect = new SparseVector();
            spVect.AddElements(tokenCounter.Select(tc => (_dict.AddOrGet(tc.Token), tc.Count)));

            var entryIdentifier = string.Join(":", _identifyingFields.Select(idf => tokenData[idf].ToString()));
            projectIndex[entryIdentifier] = spVect;
        }
    }
}