/**
* @file ArchiveTokenization.cs
*
* @copyright Copyright (c) Microsoft Corporation.
* Licensed under the MIT License.
*/
namespace ArchiveUtility
{
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using RecipeLookup = System.Collections.Generic.Dictionary<ArchiveUtility.ItemDefinition, ArchiveUtility.Recipe>;
using SerializedPayloadList = System.Collections.Generic.List<System.Collections.Generic.KeyValuePair<ArchiveUtility.Payload, System.Collections.Generic.HashSet<ArchiveUtility.ItemDefinition>>>;
[SuppressMessage("Microsoft.StyleCop.CSharp.ReadabilityRules", "SA1121", Justification = "We want to be explicit about bit-width using these aliases.")]
public class ArchiveTokenization
{
public string Type { get; private set; }
public string Subtype { get; private set; }
public string WorkingFolder { get; set; }
public string ItemFolder { get => Path.Combine(WorkingFolder, "items"); }
public ItemDefinition ArchiveItem { get; set; } = null;
// Items that recipes may depend on without the items being present in this
// tokenization; used to allow diffs to depend on the source archive item.
public ItemDefinition SourceItem { get; set; } = null;
public SerializedPayloadList Payload { get => PayloadCatalog.Entries.ToList(); }
public IEnumerable<Recipe> Recipes { get => GetAllRecipes(); }
public RecipeLookup ForwardRecipes { get; set; } = new();
public RecipeLookup ReverseRecipes { get; set; } = new();
private PayloadCatalog PayloadCatalog = new();
private RecipeCatalog RecipeCatalog = new();
private HashSet<ItemDefinition> Items = new();
private Dictionary<UInt64, HashSet<ItemDefinition>> ItemsBySize = new();
public ArchiveTokenization(string type, string subtype)
{
Type = type;
Subtype = subtype;
}
public ArchiveTokenization(string type, string subtype, ArchiveTokenization parentTokens)
{
Type = type;
Subtype = subtype;
PayloadCatalog = parentTokens.PayloadCatalog;
RecipeCatalog = parentTokens.RecipeCatalog;
ForwardRecipes = parentTokens.ForwardRecipes;
ReverseRecipes = parentTokens.ReverseRecipes;
ArchiveItem = parentTokens.ArchiveItem;
}
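// Creates a "Diff"/"Standard" tokenization whose target is this archive and
// whose source item comes from sourceTokens, copying this tokenization's
// forward recipes so the diff can rebuild the target.
// Minimal usage sketch (the file names are hypothetical):
//
//   var targetTokens = ArchiveTokenization.FromJsonPath("target.tokens.json");
//   var sourceTokens = ArchiveTokenization.FromJsonPath("source.tokens.json");
//   var diffTokens = targetTokens.CreateDiffTokens(sourceTokens);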
public ArchiveTokenization CreateDiffTokens(ArchiveTokenization sourceTokens)
{
ArchiveTokenization diffTokens = new("Diff", "Standard");
diffTokens.SourceItem = sourceTokens.ArchiveItem;
diffTokens.ArchiveItem = ArchiveItem;
foreach (Recipe recipe in ForwardRecipes.Values)
{
diffTokens.AddForwardRecipe(recipe);
}
return diffTokens;
}
public void SetPayload(SerializedPayloadList payloadList)
{
foreach (var payloadEntry in payloadList)
{
var payload = payloadEntry.Key;
foreach (var item in payloadEntry.Value)
{
PayloadCatalog.AddPayload(payload, item);
}
}
}
public void SetRecipes(List<Recipe> recipes)
{
foreach (var recipe in recipes)
{
AddRecipe(recipe);
}
}
private static bool HasItemImpl(ItemDefinition item)
{
return item != null && item.Length > 0;
}
public bool HasSourceItem()
{
return HasItemImpl(SourceItem);
}
public ItemDefinition InlineAssetsItem { get; set; } = null;
public bool HasInlineAssetsItem()
{
return HasItemImpl(InlineAssetsItem);
}
public ItemDefinition RemainderItem { get; set; } = null;
public bool HasRemainderItem()
{
return HasItemImpl(RemainderItem);
}
public bool IsSpecialItem(ItemDefinition item)
{
if (item.Equals(ArchiveItem))
{
return true;
}
if (HasInlineAssetsItem() && item.Equals(InlineAssetsItem))
{
return true;
}
if (HasRemainderItem() && item.Equals(RemainderItem))
{
return true;
}
return false;
}
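// Root payloads are keyed by (ArchiveItem, name): entries that live directly
// in this archive rather than inside a nested one.
// Sketch (the payload name and item are hypothetical):
//
//   tokens.AddRootPayload("usr/bin/tool", toolItem);
//   bool present = tokens.HasRootPayload("usr/bin/tool"); // true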
public void AddRootPayload(string name, ItemDefinition item)
{
Payload payload = new(ArchiveItem, name);
PayloadCatalog.AddPayload(payload, item);
}
public bool HasRootPayload(string name)
{
Payload payload = new(ArchiveItem, name);
return PayloadCatalog.HasPayload(payload);
}
public bool HasPayloadWithName(string name) => PayloadCatalog.HasPayloadWithName(name);
public IEnumerable<ItemDefinition> GetPayloadWithName(string name) => PayloadCatalog.GetPayloadWithName(name);
public IEnumerable<ItemDefinition> GetPayloadMatchingWildcard(string name) => PayloadCatalog.GetPayloadMatchingWildcard(name);
public IEnumerable<ItemDefinition> GetRootPayload(string name)
{
Payload payload = new(ArchiveItem, name);
return PayloadCatalog.GetPayload(payload);
}
public void AddRecipe(Recipe recipe)
{
var result = recipe.Result;
AddItem(result);
var ingredients = recipe.ItemIngredients;
foreach (var ingredient in ingredients)
{
AddItem(ingredient);
}
RecipeCatalog.AddRecipe(recipe);
}
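// Records the item in the flat set and in the by-length index so that items
// of a given size can be looked up quickly later.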
private void AddItem(ItemDefinition item)
{
Items.Add(item);
if (!ItemsBySize.TryGetValue(item.Length, out var itemsForThisSize))
{
itemsForThisSize = new();
ItemsBySize[item.Length] = itemsForThisSize;
}
itemsForThisSize.Add(item);
}
public void AddRecipes(IEnumerable<Recipe> recipes)
{
foreach (var recipe in recipes)
{
AddRecipe(recipe);
}
}
public void AddForwardRecipe(Recipe recipe)
{
ForwardRecipes[recipe.Result] = recipe;
AddRecipe(recipe);
}
public void AddReverseRecipe(Recipe recipe)
{
ReverseRecipes[recipe.Result] = recipe;
AddRecipe(recipe);
}
public bool HasAnyRecipes(ItemDefinition item)
{
return RecipeCatalog.HasAnyRecipes(item);
}
public bool TryAddRecipe(Recipe recipe) => RecipeCatalog.TryAddRecipe(recipe);
public bool HasRecipe(Recipe recipe) => RecipeCatalog.HasRecipe(recipe);
public void ClearRecipes() => RecipeCatalog.ClearRecipes();
public IEnumerable<Recipe> GetRemainderRecipes() => RecipeCatalog.GetRecipesUsing(RemainderItem);
public IEnumerable<Recipe> GetInlineAssetRecipes() => RecipeCatalog.GetRecipesUsing(InlineAssetsItem);
public IEnumerable<Recipe> GetAllRecipes() => RecipeCatalog.GetAllRecipes();
public HashSet<ItemDefinition> GetAllPayloadItems() => PayloadCatalog.Entries.Values.SelectMany(x => x).ToHashSet();
// Serialization
public static JsonSerializerOptions GetStandardJsonSerializerOptions()
{
var options = new JsonSerializerOptions()
{
Converters =
{
new ArchiveTokenizationJsonConverter(),
new ItemDefinitionJsonConverter(),
new HashJsonConverter(),
new RecipeJsonConverter(),
},
MaxDepth = 128,
};
return options;
}
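// Round-trip sketch using the helpers below (the path is hypothetical):
//
//   var tokens = ArchiveTokenization.FromJsonPath("archive.tokens.json");
//   string compact = tokens.ToJson();      // unindented
//   string readable = tokens.ToJson(true); // indented, easier to inspect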
public static ArchiveTokenization FromJsonPath(string path)
{
using var stream = File.OpenRead(path);
return FromJson(stream);
}
public static ArchiveTokenization FromJson(Stream stream)
{
using var reader = new StreamReader(stream);
var jsonText = reader.ReadToEnd();
return FromJson(jsonText);
}
public static ArchiveTokenization FromJson(string jsonText)
{
var options = GetStandardJsonSerializerOptions();
var deserialized = JsonSerializer.Deserialize<ArchiveTokenization>(jsonText, options);
return deserialized;
}
public void WriteJson(Stream stream, bool writeIndented)
{
using var writer = new StreamWriter(stream, Encoding.UTF8, -1, true);
var jsonText = ToJson(writeIndented);
writer.Write(jsonText);
}
public string ToJson(bool writeIndented)
{
var options = GetStandardJsonSerializerOptions();
options.WriteIndented = writeIndented;
return JsonSerializer.Serialize(this, options);
}
public string ToJson()
{
return ToJson(false);
}
#pragma warning disable SA1010 // Opening square brackets should be spaced correctly
private static readonly string[][] ArchiveExtensions = [[".ext4", "ext4"], [".ext2", "ext4"], [".ext3", "ext4"], [".tar", "tar"], [".cpio", "cpio"], [".swu", "swu"], [".zst", "zstd"], [".zstd", "zstd"], [".gz", "zip"]];
#pragma warning restore SA1010 // Opening square brackets should be spaced correctly
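// Finds root payloads whose names end in a known archive extension, extracts
// them, and tokenizes each one as a nested archive, merging the results back
// into this tokenization. Per the table above, .ext2/.ext3 images are handled
// by the ext4 loader and .gz by the zip loader.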
public void ProcessNestedArchives(Stream stream, ArchiveUseCase useCase)
{
// Not useful to get a detailed view of nested items if we can't reconstruct this archive from them.
if ((useCase == ArchiveUseCase.DiffTarget) && !ForwardRecipes.ContainsKey(ArchiveItem))
{
return;
}
List<Tuple<ItemDefinition, string, string>> potentialNestedArchives = new();
foreach (var payloadEntry in PayloadCatalog.Entries)
{
Payload payload = payloadEntry.Key;
if (!payload.ArchiveItem.Equals(ArchiveItem))
{
continue;
}
var payloadName = payload.Name;
foreach (var extInfo in ArchiveExtensions)
{
var ext = extInfo[0];
var archiveType = extInfo[1];
if (payloadName.EndsWith(ext, StringComparison.OrdinalIgnoreCase))
{
var payloadItems = payloadEntry.Value;
foreach (var item in payloadItems)
{
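// Only the first item for this payload is needed to locate the archive.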
potentialNestedArchives.Add(new(item, payloadName, archiveType));
break;
}
}
}
}
using (FileFromStream file = new FileFromStream(stream, WorkingFolder))
{
string archivePath = file.Name;
if (!TryExtractItems(ArchiveLoaderContext.DefaultLogger, archivePath, potentialNestedArchives.Select(n => n.Item1)))
{
throw new Exception($"Couldn't extract nested archive items from: {archivePath}");
}
}
foreach (var nested in potentialNestedArchives)
{
var (nestedItem, payloadName, type) = nested;
var archiveFile = nestedItem.GetExtractionPath(ItemFolder);
using var archiveStream = File.OpenRead(archiveFile);
ArchiveLoaderContext context = new(archiveStream, WorkingFolder, ArchiveLoaderContext.DefaultLogger, LogLevel.None)
{
UseCase = useCase,
OriginalArchiveFileName = payloadName,
};
if (ArchiveLoader.TryLoadArchive(context, out ArchiveTokenization tokens, type))
{
context.Logger?.LogInformation("Loaded nested archive of type: {type}", type);
string nestedJson = tokens.ArchiveItem.GetExtractionPath(context.WorkingFolder) + $".{type}.json";
context.Logger?.LogInformation("Writing nested json to {NestedJson}", nestedJson);
using (var nestedJsonStream = File.OpenWrite(nestedJson))
{
tokens.WriteJson(nestedJsonStream, true);
}
ImportArchive(context.Logger, tokens);
}
}
}
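// Walks recipes transitively to collect every item the given item depends on
// (including itself). Example with hypothetical items A, B, and C: given
// recipes "A = Chain(B, C)" and "B = Slice(ArchiveItem, 0)",
// GetDependencies(A, excludeSpecialItems: true) yields { A, B, C };
// ArchiveItem is skipped because IsSpecialItem returns true for it.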
public HashSet<ItemDefinition> GetDependencies(ItemDefinition item, bool excludeSpecialItems)
{
HashSet<ItemDefinition> dependencies = new();
PopulateDependencies(item, dependencies, excludeSpecialItems);
return dependencies;
}
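// A chunk is a region of the archive: an offset plus the item found there.
// Slice recipes carry the slice offset as their first number ingredient, so
// each existing slice of the archive maps directly onto a chunk.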
private record Chunk(ulong Offset, ItemDefinition Item);
private static Chunk ChunkFromSlice(Recipe recipe) => new Chunk(recipe.NumberIngredients[0], recipe.Result);
private List<Chunk> GetChunksFromRecipes()
{
var slices = RecipeCatalog.GetSlicesOf(ArchiveItem);
var chunks = slices.Select(x => ChunkFromSlice(x)).ToList();
return chunks;
}
private IEnumerable<Chunk> GetAllChunks(Stream stream)
{
var chunks = GetChunksFromRecipes();
// We don't want to create gap chunks if no chunks are already defined.
if (chunks.Count == 0)
{
return chunks;
}
var sortedChunks = chunks.OrderBy(x => x.Offset);
List<Chunk> gapChunks = new();
ulong expectedOffset = 0;
foreach (var chunk in sortedChunks)
{
if (expectedOffset < chunk.Offset)
{
stream.Seek((long)expectedOffset, SeekOrigin.Begin);
using var reader = new BinaryReader(stream, Encoding.ASCII, true);
ulong length = chunk.Offset - expectedOffset;
var chunksForThisGap = MakeChunksForGap(reader, expectedOffset, length);
gapChunks.AddRange(chunksForThisGap);
}
expectedOffset = chunk.Offset + chunk.Item.Length;
}
if (expectedOffset == 0)
{
throw new Exception("Found no chunks in the archive; the expected offset after the last chunk is zero, which would produce a single gap chunk spanning the entire file.");
}
if (expectedOffset < ArchiveItem.Length)
{
stream.Seek((long)expectedOffset, SeekOrigin.Begin);
using var reader = new BinaryReader(stream, Encoding.ASCII, true);
ulong length = ArchiveItem.Length - expectedOffset;
var chunksForThisGap = MakeChunksForGap(reader, expectedOffset, length);
gapChunks.AddRange(chunksForThisGap);
}
chunks.AddRange(gapChunks);
return chunks.OrderBy(x => x.Offset);
}
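// Defines an item for the gap [begin, end). If the gap is entirely zeros, an
// AllZeros forward recipe reproduces it without storing any data; either way,
// a Slice reverse recipe records where the gap sits within the archive.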
private ItemDefinition CreateItemForGap(BinaryReader reader, ulong begin, ulong end)
{
ulong length = end - begin;
reader.BaseStream.Seek((long)begin, SeekOrigin.Begin);
ItemDefinition item = ItemDefinition.FromBinaryReader(reader, length).WithName(ChunkNames.MakeGapChunkName(begin, length));
bool allZeros = true;
ulong remaining = length;
const int readBlockSize = 1024 * 8;
byte[] data = new byte[readBlockSize];
reader.BaseStream.Seek((long)begin, SeekOrigin.Begin);
while (remaining > 0)
{
int toRead = (int)Math.Min(remaining, readBlockSize);
var span = new Span<byte>(data, 0, toRead);
int actualRead = reader.Read(span);
if (actualRead != toRead)
{
throw new Exception($"Couldn't read gap data. Didn't read expected amount of bytes. Expected: {toRead}, Actual: {actualRead}");
}
if (!AsciiData.IsAllNul(span))
{
allZeros = false;
break;
}
remaining -= (ulong)actualRead;
}
if (allZeros)
{
Recipe allZerosRecipe = new(Recipe.RecipeTypeToString(RecipeType.AllZeros), item, new(), new());
AddForwardRecipe(allZerosRecipe);
}
Recipe sliceRecipe = new(RecipeType.Slice, item, new() { begin }, new List<ItemDefinition>() { ArchiveItem });
AddReverseRecipe(sliceRecipe);
return item;
}
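// Splits a gap into chunks aligned to 1 KiB boundaries and capped at 10 MiB.
// Worked example: a gap at offset 1500 with length 5000 (ending at 6500)
// produces the chunks [1500, 2048) and [2048, 6500).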
private IEnumerable<Chunk> MakeChunksForGap(BinaryReader reader, ulong offset, ulong length)
{
var chunks = new List<Chunk>();
ulong gapBegin = offset;
ulong gapEnd = offset + length;
// Align gap chunks on 1024-byte boundaries to make matching easier.
if (gapBegin % 1024 != 0)
{
ulong updatedGapBegin = ((gapBegin >> 10) + 1) << 10;
if (updatedGapBegin <= gapEnd)
{
var item = CreateItemForGap(reader, gapBegin, updatedGapBegin);
chunks.Add(new(gapBegin, item));
gapBegin = updatedGapBegin;
}
}
// The maximum size allowed for a gap chunk.
const ulong TEN_MB = 10 * (1 << 20);
for (ulong thisGapBegin = gapBegin, thisGapEnd; thisGapBegin < gapEnd; thisGapBegin = thisGapEnd)
{
thisGapEnd = Math.Min(gapEnd, thisGapBegin + TEN_MB);
var item = CreateItemForGap(reader, thisGapBegin, thisGapEnd);
chunks.Add(new(thisGapBegin, item));
}
return chunks;
}
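// Covers any unclaimed regions of the archive with gap chunks, then records a
// Chain recipe that rebuilds the archive from every chunk in offset order.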
public void HandleGapChunks(Stream stream)
{
var allChunks = GetAllChunks(stream);
var items = allChunks.Select(x => x.Item).ToList();
// Don't overwrite whatever the JSON already had.
if (ForwardRecipes.ContainsKey(ArchiveItem))
{
return;
}
if (items.Count == 0)
{
return;
}
var totalItemLength = items.Sum(x => (long)x.Length);
if (totalItemLength != (long)ArchiveItem.Length)
{
throw new Exception($"Total item length for the chain recipe does not match the archive while handling gaps. Archive length: {ArchiveItem.Length}, chain total: {totalItemLength}");
}
Recipe archiveRecipe = new(RecipeType.Chain, ArchiveItem, new(), items);
AddForwardRecipe(archiveRecipe);
}
private void PopulateDependencies(ItemDefinition item, HashSet<ItemDefinition> dependencies, bool excludeSpecialItems)
{
if (excludeSpecialItems && IsSpecialItem(item))
{
return;
}
if (!dependencies.Add(item))
{
return;
}
var recipes = RecipeCatalog.GetRecipes(item);
foreach (var recipe in recipes)
{
foreach (var ingredient in recipe.ItemIngredients)
{
PopulateDependencies(ingredient, dependencies, excludeSpecialItems);
}
}
}
public HashSet<Recipe> GetRecipes(ItemDefinition item) => RecipeCatalog.GetRecipes(item);
public bool HasArchiveItem(ItemDefinition item) => PayloadCatalog.ArchiveItems.Contains(item);
public IEnumerable<ItemDefinition> ArchiveItems => PayloadCatalog.ArchiveItems;
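// Merges a nested archive's recipes, forward/reverse recipe maps, and payload
// entries into this tokenization, so items inside the nested archive become
// reachable from this archive's catalogs.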
private void ImportArchive(ILogger logger, ArchiveTokenization tokens)
{
var toArchive = ArchiveItem.GetSha256HashString();
var fromArchive = tokens.ArchiveItem.GetSha256HashString();
logger?.LogInformation("Importing archive {fromArchive} into {toArchive}", fromArchive, toArchive);
if (tokens.RecipeCatalog.HasAnyRecipes(tokens.ArchiveItem))
{
logger?.LogInformation("Imported archive has a recipe for the ArchiveItem.");
}
else
{
logger?.LogInformation("Imported archive does not have a recipe for the ArchiveItem.");
}
foreach (var recipe in tokens.Recipes)
{
RecipeCatalog.AddRecipe(recipe);
}
foreach (var recipeEntry in tokens.ForwardRecipes)
{
var result = recipeEntry.Key;
var recipe = recipeEntry.Value;
ForwardRecipes[result] = recipe;
}
foreach (var recipeEntry in tokens.ReverseRecipes)
{
var result = recipeEntry.Key;
var recipe = recipeEntry.Value;
ReverseRecipes[result] = recipe;
}
foreach (var (payload, items) in tokens.Payload)
{
foreach (var item in items)
{
PayloadCatalog.AddPayload(payload, item);
}
}
}
public bool TryExtractItems(ILogger logger, string archivePath, IEnumerable<ItemDefinition> toExtract)
{
try
{
ExtractItems(logger, archivePath, toExtract);
}
catch (Exception)
{
return false;
}
return true;
}
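// Extraction flow: seed a create session with every known recipe, open an
// apply session, request each wanted item, add the archive file as the item
// "pantry" (the raw ingredient source), process the requests, then write each
// item under ItemFolder and verify the result against its definition.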
public void ExtractItems(ILogger logger, string archivePath, IEnumerable<ItemDefinition> toExtract)
{
using var createSession = new DiffApi.DiffcSession();
createSession.SetTarget(ArchiveItem);
var extractionRoot = ItemFolder;
int recipeCount = 0;
foreach (var entry in RecipeCatalog.Entries)
{
var recipes = entry.Value;
foreach (var recipe in recipes)
{
recipeCount++;
createSession.AddRecipe(recipe);
}
}
using var applySession = createSession.NewApplySession();
logger?.LogInformation("Trying to extract {toExtractCount:N0} items from {archivePath} using {recipeCount:N0} recipes.", toExtract.Count(), archivePath, recipeCount);
foreach (var item in toExtract)
{
applySession.RequestItem(item);
}
applySession.AddItemToPantry(archivePath);
if (!applySession.ProcessRequestedItems())
{
var msg = $"Can't process items from: {archivePath}";
logger?.LogError(msg);
throw new Exception(msg);
}
applySession.ResumeSlicing();
Directory.CreateDirectory(extractionRoot);
foreach (var item in toExtract)
{
var hashString = item.GetSha256HashString();
if (hashString == null)
{
var msg = "Trying to extract an item without a sha256 hash.";
logger?.LogError(msg);
throw new Exception(msg);
}
var itemPath = item.GetExtractionPath(extractionRoot);
applySession.ExtractItemToPath(item, itemPath);
}
applySession.CancelSlicing();
foreach (var item in toExtract)
{
var itemPath = item.GetExtractionPath(extractionRoot);
var fromFile = ItemDefinition.FromFile(itemPath);
if (!item.Equals(fromFile))
{
var msg = $"Extracted file {itemPath} does not match the expected result. Expected: {item}, Actual: {fromFile}";
logger?.LogError(msg);
applySession.CancelSlicing();
throw new Exception(msg);
}
}
}
}
}