alloc.go:

// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package txfile

import (
	"fmt"
	"math"

	"github.com/elastic/go-txfile/internal/invariant"
)

// file global allocator state
type (
	// allocator manages the on-disk page allocation. Pages in the allocator
	// can be part of either the meta-area or the data-area. Users allocate
	// pages from the data-area only. The meta-area keeps pages available for
	// in-file meta-data like overwrite pages and freelists. The meta-area
	// allocates pages from the data-area if required, and grows by doubling
	// its page count.
	//
	// For allocations, one obtains an instance of the dataAllocator,
	// walAllocator, or metaAllocator respectively. Each allocator provides a
	// slightly different allocation strategy. The walAllocator is used for
	// content overwrite pages, while the metaAllocator is used to allocate
	// pages for serializing the overwrite mapping and the freelist.
	allocator struct {
		// configuration
		maxPages uint
		maxSize  uint
		pageSize uint

		// meta area
		meta      allocArea
		metaTotal uint // total number of pages reserved by the meta area

		// data area
		data allocArea

		// allocator file metadata
		freelistRoot  PageID
		freelistPages regionList // page ids used to store the free list
	}

	allocArea struct {
		endMarker PageID
		freelist  freelist
	}

	// custom allocator implementations, sharing the global allocator state
	dataAllocator allocator // allocate from data area
	walAllocator  allocator // allocate WAL overwrite pages from beginning of meta area
	metaAllocator allocator // allocate meta pages from end of meta area

	// metaManager manages the data and meta areas by moving regions between
	// them. The manager is used by walAllocator and metaAllocator only.
	metaManager allocator
)
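// Rough usage sketch (hypothetical, error handling omitted): a write
// transaction derives a transaction-local allocation state (see the types
// below) and allocates pages through one of the allocator views, committing
// or rolling back the recorded changes at the end:
//
//	st := a.makeTxAllocState(false, 0) // overflow off, default grow percentage
//	reg := a.DataAllocator().AllocContinuousRegion(&st, 4)
//	if reg.id == 0 {
//		// out of space
//	}
//	// on failure: a.Rollback(&st) restores the in-memory allocator state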
// transaction local allocation state
type (
	// txAllocState is used by write transactions to record changes to the
	// file allocation state. The file-global allocator state is modified
	// within the write transaction; txAllocState acts as an undo/redo-log for
	// the in-memory allocation state.
	//
	// Page frees are only recorded within the transaction. No pages are
	// returned to the allocator, so as to ensure a freed page can not be
	// re-allocated. This guarantees freed pages can not be overwritten in the
	// current transaction (keeping the most recent transaction intact).
	txAllocState struct {
		manager txAreaManageState
		data    txAllocArea
		meta    txAllocArea
		options txAllocOptions // per transaction allocation options
		stats   txAllocStats
	}

	txAllocArea struct {
		endMarker PageID
		allocated pageSet // allocated pages from freelist
		new       pageSet // allocated pages from end of file
		freed     pageSet // set of pages freed within transaction
	}

	txAreaManageState struct {
		moveToMeta regionList // list of regions moved from data area to meta area
	}

	// txAllocOptions keeps track of user options passed to the transaction.
	txAllocOptions struct {
		overflowAreaEnabled bool // enable allocating pages with ID > maxPages for metadata
		metaGrowPercentage  int  // usage limit (in percent) before growing the meta area
	}

	txAllocStats struct {
		data     txAllocAreaStats
		meta     txAllocAreaStats
		overflow txAllocAreaStats // overflow region allocations/frees
		toMeta   uint             // number of pages moved from data area to meta area
	}

	txAllocAreaStats struct {
		alloc, freed uint
	}
)

// allocCommitState keeps track of the new allocator state during the commit.
// These changes must be recorded for now, as the final allocator state must
// not be updated in memory until after the transaction has been committed to
// the file.
type allocCommitState struct {
	tx            *txAllocState
	updated       bool       // set if the allocator has been updated within the current transaction
	allocRegions  regionList // meta pages allocated for writing the new freelist to
	dataEndMarker PageID     // new data area end marker after cleaning the data free list
	metaEndMarker PageID     // new meta area end marker after cleaning the data free list
	metaList      regionList // new meta area freelist
	dataList      regionList // new data area freelist
	dataFreed     uint       // number of pages in the data region removed from the free list
	overflowFreed uint       // number of pages in the overflow region removed from the free list
}

// noLimit indicates the data/meta-area can grow without any limits.
const noLimit uint = maxUint

const defaultMetaGrowPercentage = 80

// allocator
// ---------

func (a *allocator) DataAllocator() *dataAllocator { return (*dataAllocator)(a) }

func (a *allocator) WALPageAllocator() *walAllocator { return (*walAllocator)(a) }

func (a *allocator) MetaAllocator() *metaAllocator { return (*metaAllocator)(a) }

func (a *allocator) metaManager() *metaManager { return (*metaManager)(a) }

func (a *allocator) makeTxAllocState(withOverflow bool, growPercentage int) txAllocState {
	if growPercentage <= 0 {
		growPercentage = defaultMetaGrowPercentage
	}

	return txAllocState{
		data: txAllocArea{
			endMarker: a.data.endMarker,
		},
		meta: txAllocArea{
			endMarker: a.meta.endMarker,
		},
		options: txAllocOptions{
			overflowAreaEnabled: withOverflow,
			metaGrowPercentage:  growPercentage,
		},
	}
}

func (a *allocator) fileCommitPrepare(
	st *allocCommitState,
	tx *txAllocState,
	forceUpdate bool,
) {
	st.tx = tx
	st.updated = forceUpdate || tx.Updated()
}
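// Commit pipeline sketch (an interpretation of the fileCommit* helpers below,
// with hypothetical variable names): a write transaction is expected to call,
// in order,
//
//	a.fileCommitPrepare(&cs, &txState, force) // check whether the allocator changed
//	a.fileCommitAlloc(&cs)                    // reserve meta pages for the new freelist
//	a.fileCommitSerialize(&cs, onPage)        // encode the freelists into those pages
//	a.fileCommitMeta(metaPage, &cs)           // stage the new on-disk meta fields
//	a.Commit(&cs)                             // apply the state in memory only at the end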
func (a *allocator) fileCommitAlloc(st *allocCommitState) reason {
	const op = "txfile/commit-alloc-meta"

	if !st.updated {
		return nil
	}

	dataFreed := st.tx.data.freed.Regions()
	metaFreed := st.tx.meta.freed.Regions()

	// Predict the number of meta pages required to store the new freelist by
	// iterating all region entries and taking the potential encoding size
	// into account. As the allocation itself might force a region from the
	// data area to be moved (or split) into the meta area, we add two dummy
	// regions of enforced maximum size, so the allocator can still move pages
	// between meta and data if required.
	// This over-estimates the number of required pages, as pages will still
	// be allocated from the end of the meta freelist after the estimate has
	// been made.
	prediction := prepareFreelistEncPagePrediction(freePageHeaderSize, a.pageSize)
	prediction.AddRegions(dataFreed)
	prediction.AddRegions(metaFreed)
	prediction.AddRegions(a.data.freelist.regions)
	prediction.AddRegions(a.meta.freelist.regions)
	if prediction.count > 0 {
		// only add extra pages if we need to write the meta page
		prediction.AddRegion(region{id: 1, count: math.MaxUint32})
		prediction.AddRegion(region{id: 1, count: math.MaxUint32})
	}

	// alloc regions for writing the new freelist
	var allocRegions regionList
	if n := prediction.count; n > 0 {
		allocRegions = a.MetaAllocator().AllocRegions(st.tx, n)
		if allocRegions == nil {
			return a.err(op).of(OutOfMemory).
				report("not enough space to allocate freelist meta pages")
		}
	}

	// Compute the new freelist. As consecutive regions are merged, the
	// resulting list might require fewer pages.
	newDataList := mergeRegionLists(a.data.freelist.regions, dataFreed)
	newMetaList := mergeRegionLists(a.meta.freelist.regions, metaFreed)

	st.allocRegions = allocRegions
	dataEndMarker := a.data.endMarker
	metaEndMarker := a.meta.endMarker

	// Remove pages at the end of the overflow area from the meta freelist and
	// adjust the end marker.
	st.metaList, st.overflowFreed = releaseOverflowPages(newMetaList, a.maxPages, metaEndMarker)
	if st.overflowFreed > 0 {
		st.tx.stats.overflow.freed += st.overflowFreed
		newEnd := metaEndMarker - PageID(st.overflowFreed)
		if metaEndMarker > dataEndMarker {
			// shrink the overflow area, which was allocated from the data area
			dataEndMarker = newEnd
		}
		metaEndMarker = newEnd
	}

	// Remove pages from the end of the data area. Pages are removed from the
	// data area only if the file size has been decreased.
	st.dataList, st.dataFreed = releaseOverflowPages(newDataList, a.maxPages, dataEndMarker)
	if st.dataFreed > 0 {
		dataEndMarker -= PageID(st.dataFreed)
		if metaEndMarker >= dataEndMarker {
			metaEndMarker = dataEndMarker
		}
	}

	// Update the new allocator end markers in case regions have been removed
	// from the free lists.
	st.dataEndMarker = dataEndMarker
	st.metaEndMarker = metaEndMarker

	return nil
}
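// Illustration for releaseOverflowPages below (hypothetical numbers): with
// maxPages=100, endMarker=110, and a freelist ending in {id: 95, count: 15},
// the trailing region is split into {id: 95, count: 5} and 10 overflow pages
// are reported as freed, letting fileCommitAlloc pull the end marker back to
// 100.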
// releaseOverflowPages removes pages at the end of a region list as long as
// the current end marker is bigger than the maximum number of allowed pages
// and the freelist contains continuous regions reaching up to endMarker.
func releaseOverflowPages(
	list regionList,
	maxPages uint,
	endMarker PageID,
) (regionList, uint) {
	overflowStart, overflowEnd := PageID(maxPages), endMarker
	if maxPages == 0 || overflowStart >= overflowEnd {
		return list, 0
	}

	var freed uint
	for i := len(list) - 1; i != -1; i-- {
		start, end := list[i].Range()
		if end < overflowEnd {
			break
		}

		if start < overflowStart {
			// split
			list[i].count = uint32(overflowStart - start)
			freed += uint(end - overflowStart)
			overflowEnd = overflowStart
		} else {
			// remove range
			overflowEnd = start
			freed += uint(list[i].count)
			list = list[:i]
		}
	}
	if len(list) == 0 {
		list = nil
	}
	return list, freed
}

func (a *allocator) fileCommitSerialize(
	st *allocCommitState,
	onPage func(id PageID, buf []byte) reason,
) reason {
	const op = "txfile/commit-serialize-alloc"

	if !st.updated || len(st.allocRegions) == 0 {
		return nil
	}

	err := writeFreeLists(st.allocRegions, a.pageSize, st.metaList, st.dataList, onPage)
	if err != nil {
		return a.errWrap(op, err).report("failed to serialize allocator state")
	}
	return nil
}

func (a *allocator) fileCommitMeta(meta *metaPage, st *allocCommitState) {
	if st.updated {
		var freelistRoot PageID
		if len(st.allocRegions) > 0 {
			freelistRoot = st.allocRegions[0].id
		}

		meta.freelist.Set(freelistRoot)
		meta.dataEndMarker.Set(st.dataEndMarker)
		meta.metaEndMarker.Set(st.metaEndMarker)
		meta.metaTotal.Set(uint64(a.metaTotal - st.overflowFreed))
	}
}

func (a *allocator) Commit(st *allocCommitState) {
	if st.updated {
		a.freelistPages = st.allocRegions
		if len(st.allocRegions) > 0 {
			a.freelistRoot = st.allocRegions[0].id
		} else {
			a.freelistRoot = 0
		}

		a.data.commit(st.dataEndMarker, st.dataList)
		a.meta.commit(st.metaEndMarker, st.metaList)
		a.metaTotal -= st.overflowFreed
	}
}

func (a *allocator) Rollback(st *txAllocState) {
	// restore the meta area
	a.meta.rollback(&st.meta)
	for _, reg := range st.manager.moveToMeta {
		a.meta.freelist.RemoveRegion(reg)
		a.metaTotal -= uint(reg.count)
		if reg.id < st.data.endMarker {
			reg.EachPage(st.data.allocated.Add)
		}
	}

	// restore the data area
	a.data.rollback(&st.data)
}

func (a *allocator) err(op string) *Error { return &Error{op: op} }

func (a *allocator) errWrap(op string, err error) *Error { return a.err(op).causedBy(err) }

func (a *allocArea) commit(endMarker PageID, regions regionList) {
	a.endMarker = endMarker
	a.freelist.regions = regions
	a.freelist.avail = regions.CountPages()
}

func (a *allocArea) rollback(st *txAllocArea) {
	for id := range st.allocated {
		if id >= st.endMarker {
			delete(st.allocated, id)
		}
	}
	a.freelist.AddRegions(st.allocated.Regions())
	a.endMarker = st.endMarker
}

// metaManager
// -----------

func (mm *metaManager) onGrow(st *txAllocState, n uint, overflow bool) {
	if overflow {
		st.stats.overflow.alloc += n
	}
	st.stats.toMeta += n
}

func (mm *metaManager) onAlloc(st *txAllocState, n uint) {
	st.stats.meta.alloc++
}

func (mm *metaManager) onFree(st *txAllocState, n uint) {
	st.stats.meta.freed++
}

func (mm *metaManager) DataAllocator() *dataAllocator {
	return (*dataAllocator)(mm)
}

func (mm *metaManager) Avail(st *txAllocState) uint {
	dataAvail := mm.DataAllocator().Avail(st)
	if dataAvail == noLimit || st.options.overflowAreaEnabled {
		return noLimit
	}
	return mm.meta.freelist.Avail() + dataAvail
}
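// Ensure (below) checks that the meta area can serve n more pages and grows
// it if not: first by moving free regions from the data area into the meta
// area, and, only if the transaction enables it, by allocating pages past
// maxPages from the overflow area. The target sizes are computed by
// metaAreaTargetQuota (see the example before that function).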
func (mm *metaManager) Ensure(st *txAllocState, n uint) bool {
	total := mm.metaTotal
	avail := mm.meta.freelist.Avail()
	used := total - avail
	targetUsed := used + n

	invariant.Check(total >= avail, "invalid meta total page count")

	tracef("ensure(%v): total=%v, avail=%v, used=%v, targetUsed=%v\n",
		n, total, avail, used, targetUsed)

	pctGrow := st.options.metaGrowPercentage
	pctShrink := pctGrow / 2
	szMinMeta, szMaxMeta := metaAreaTargetQuota(total, targetUsed, pctShrink, pctGrow)

	traceln(" target quota: ", szMinMeta, szMaxMeta)

	invariant.Check(szMaxMeta >= szMinMeta, "required page count must grow")

	if szMaxMeta == total {
		// We still have enough space in the meta area -> return success.
		// TODO: allow 'ensure' to shrink the meta area
		return true
	}

	invariant.Check(szMaxMeta > total, "expected new page count to exceed allocated pages")

	// try to move regions from the data area into the meta area:
	requiredMax := szMaxMeta - total
	if mm.tryGrow(st, requiredMax, false) {
		return true
	}

	// Can not grow up to 'requiredMax' -> try to grow up to 'requiredMin',
	// potentially allocating pages from the overflow area.
	requiredMin := szMinMeta - total

	// returns false if we are out of memory
	return mm.tryGrow(st, requiredMin, st.options.overflowAreaEnabled)
}

func (mm *metaManager) tryGrow(
	st *txAllocState,
	count uint,
	withOverflow bool,
) bool {
	da := mm.DataAllocator()
	avail := da.Avail(st)

	tracef("try grow meta area pages=%v, avail=%v\n", count, avail)

	if count == 0 {
		return true
	}

	if avail < count {
		if !withOverflow {
			traceln("can not grow meta area yet")
			return false
		}

		// move all available data pages into the meta area
		da.AllocRegionsWith(st, avail, func(reg region) {
			mm.transferToMeta(st, reg)
		})

		// allocate the rest from the overflow area
		required := count - avail
		if required > 0 {
			traceln("try to grow overflow area")
		}
		allocFromArea(&st.meta, &mm.meta.endMarker, required, func(reg region) {
			n := uint(reg.count)
			mm.onGrow(st, n, true)
			mm.metaTotal += n
			mm.meta.freelist.AddRegion(reg)
		})
		if mm.maxPages == 0 && mm.data.endMarker < mm.meta.endMarker {
			mm.data.endMarker = mm.meta.endMarker
		}
		return true
	}

	// Enough pages available in the data area. Try to allocate one continuous
	// region first.
	reg := da.AllocContinuousRegion(st, count)
	if reg.id != 0 {
		mm.transferToMeta(st, reg)
		return true
	}

	// no continuous block of pages available -> allocate separate regions
	n := da.AllocRegionsWith(st, count, func(reg region) {
		mm.transferToMeta(st, reg)
	})
	return n == count
}

func (mm *metaManager) transferToMeta(st *txAllocState, reg region) {
	n := uint(reg.count)
	st.manager.moveToMeta.Add(reg)
	mm.onGrow(st, n, false)
	mm.metaTotal += n
	mm.meta.freelist.AddRegion(reg)
}

func (mm *metaManager) Free(st *txAllocState, id PageID) {
	// only mark the page as freed for now
	mm.onFree(st, 1)
	st.meta.freed.Add(id)
}
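// Quota example for metaAreaTargetQuota below (hypothetical numbers, assuming
// nextPowerOf2 rounds 19 up to 32): metaAreaTargetQuota(16, 19, 40, 80)
// returns min=19 and max=32; the projected usage of 19/32 (about 59%) stays
// below the 80% grow threshold, so 'max' remains at the next power of two.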
func metaAreaTargetQuota(
	total, used uint,
	shrinkPercentage, growPercentage int,
) (min, max uint) {
	min = used
	max = uint(nextPowerOf2(uint64(used)))
	if max < total {
		max = total
	}

	usage := 100 * float64(used) / float64(max)

	// grow 'max' by the next power of 2 if the used area would exceed
	// growPercentage
	needsGrow := usage > float64(growPercentage)

	// If memory is to be freed (max < total), still grow 'max' by the next
	// power of 2 (so as not to free too much memory at once) if the used area
	// in the new meta area would exceed shrinkPercentage.
	// => the used percentage in the doubled meta area will be about
	// shrinkPercentage/2
	needsGrow = needsGrow || (max < total && usage > float64(shrinkPercentage))

	if min < total {
		min = total
	}
	if needsGrow {
		max = max * 2
	}
	return min, max
}

// dataAllocator
// -------------

func (a *dataAllocator) Avail(_ *txAllocState) uint {
	if a.maxPages == 0 {
		return noLimit
	}

	avail := a.data.freelist.Avail()
	if end := uint(a.data.endMarker); end < a.maxPages {
		avail += a.maxPages - end
	}
	return avail
}

func (a *dataAllocator) onAlloc(st *txAllocState, n uint) {
	st.stats.data.alloc += n
}

func (a *dataAllocator) onFree(st *txAllocState, n uint) {
	st.stats.data.freed += n
}

func (a *dataAllocator) AllocContinuousRegion(
	st *txAllocState,
	n uint,
) region {
	avail := a.Avail(st)
	if avail < n {
		return region{}
	}

	// try the freelist first
	reg := allocContFromFreelist(&a.data.freelist, &st.data, allocFromBeginning, n)
	if reg.id != 0 {
		a.onAlloc(st, n)
		return reg
	}

	// allocate from the yet unused end of the data area
	avail = a.maxPages - uint(a.data.endMarker)
	if avail < n {
		// out of memory
		return region{}
	}

	allocFromArea(&st.data, &a.data.endMarker, n, func(r region) { reg = r })
	if a.meta.endMarker < a.data.endMarker {
		a.meta.endMarker = a.data.endMarker
	}
	a.onAlloc(st, n)
	return reg
}

func (a *dataAllocator) AllocRegionsWith(
	st *txAllocState,
	n uint,
	fn func(region),
) uint {
	avail := a.Avail(st)
	if avail < n {
		return 0
	}

	// Enough space available -> allocate all pages.
	count := n

	// 1. allocate a subset of regions from the freelist
	n -= allocFromFreelist(&a.data.freelist, &st.data, allocFromBeginning, n, fn)
	if n > 0 {
		// 2. allocate the remainder from the yet unused data area
		allocFromArea(&st.data, &a.data.endMarker, n, fn)
		if a.meta.endMarker < a.data.endMarker {
			a.meta.endMarker = a.data.endMarker
		}
	}

	a.onAlloc(st, count)
	return count
}

func (a *dataAllocator) Free(st *txAllocState, id PageID) {
	traceln("free page:", id)

	if id < 2 || id >= a.data.endMarker {
		panic(fmt.Sprintf("freed page ID %v out of bounds", id))
	}

	a.onFree(st, 1)
	if !st.data.new.Has(id) {
		// fast path: the page has not been allocated within the current
		// transaction
		st.data.freed.Add(id)
		return
	}

	// The page has been allocated within the current transaction -> return it
	// to the allocator for immediate re-use.
	a.data.freelist.AddRegion(region{id: id, count: 1})

	if st.data.endMarker >= id {
		// allocation from within the old data region
		return
	}

	// The allocation was from past the old end marker. Check if we can shrink
	// the end marker again.
	regions := a.data.freelist.regions
	last := len(regions) - 1
	start, end := regions[last].Range()
	if end < a.data.endMarker {
		// in the middle of the new data region -> can not adjust the end
		// marker -> keep the freelist update
		return
	}

	if st.data.endMarker > start {
		start = st.data.endMarker
		count := uint(end - start)
		regions[last].count -= uint32(count)
		a.data.freelist.avail -= count
	} else {
		a.data.freelist.avail -= uint(regions[last].count)
		a.data.freelist.regions = regions[:last]
	}
	a.data.endMarker = start
}
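// Design note (an interpretation of the allocation orders used below, not a
// documented guarantee): walAllocator takes pages from the beginning of the
// meta area (allocFromBeginning), while metaAllocator takes them from the end
// (allocFromEnd), so short-lived WAL overwrite pages and freelist pages tend
// not to interleave within the shared meta freelist.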
// walAllocator
// ------------

func (a *walAllocator) metaManager() *metaManager { return (*metaManager)(a) }

func (a *walAllocator) Avail(st *txAllocState) uint {
	return a.metaManager().Avail(st)
}

func (a *walAllocator) Alloc(st *txAllocState) PageID {
	mm := a.metaManager()
	if !mm.Ensure(st, 1) {
		return 0
	}

	// Use AllocContinuousRegion to find the smallest fitting region to
	// allocate from.
	reg := a.meta.freelist.AllocContinuousRegion(allocFromBeginning, 1)
	if reg.id == 0 {
		return 0
	}
	mm.onAlloc(st, 1)
	st.meta.allocated.Add(reg.id)
	return reg.id
}

func (a *walAllocator) AllocRegionsWith(st *txAllocState, n uint, fn func(region)) uint {
	mm := a.metaManager()
	if !mm.Ensure(st, n) {
		return 0
	}
	count := allocFromFreelist(&a.meta.freelist, &st.meta, allocFromBeginning, n, fn)
	mm.onAlloc(st, count)
	return count
}

func (a *walAllocator) Free(st *txAllocState, id PageID) {
	a.metaManager().Free(st, id)
}

// metaAllocator
// -------------

func (a *metaAllocator) metaManager() *metaManager { return (*metaManager)(a) }

func (a *metaAllocator) Avail(st *txAllocState) uint {
	return a.metaManager().Avail(st)
}

func (a *metaAllocator) AllocRegionsWith(
	st *txAllocState,
	n uint,
	fn func(region),
) uint {
	mm := a.metaManager()
	if !mm.Ensure(st, n) {
		return 0
	}
	count := allocFromFreelist(&a.meta.freelist, &st.meta, allocFromEnd, n, fn)
	mm.onAlloc(st, count)
	return count
}

func (a *metaAllocator) AllocRegions(st *txAllocState, n uint) regionList {
	reg := make(regionList, 0, n)
	if n := a.AllocRegionsWith(st, n, reg.Add); n == 0 {
		return nil
	}
	return reg
}

func (a *metaAllocator) Free(st *txAllocState, id PageID) {
	a.metaManager().Free(st, id)
}

func (a *metaAllocator) FreeAll(st *txAllocState, ids idList) {
	for _, id := range ids {
		a.Free(st, id)
	}
}

func (a *metaAllocator) FreeRegions(st *txAllocState, regions regionList) {
	regions.EachPage(func(id PageID) {
		a.Free(st, id)
	})
}

// tx allocation state methods
// ---------------------------

func (s *txAllocState) Updated() bool {
	return s.meta.Updated() || s.data.Updated()
}

func (s *txAllocArea) Updated() bool {
	return !s.allocated.Empty() || !s.new.Empty() || !s.freed.Empty()
}

// allocator state (de-)serialization
// ----------------------------------

func readAllocatorState(a *allocator, f *File, meta *metaPage, opts Options) reason {
	if a.maxSize > 0 {
		a.maxPages = a.maxSize / a.pageSize
	}

	a.data.endMarker = meta.dataEndMarker.Get()
	a.meta.endMarker = meta.metaEndMarker.Get()
	a.metaTotal = uint(meta.metaTotal.Get())

	a.freelistRoot = meta.freelist.Get()
	if a.freelistRoot == 0 {
		return nil
	}

	var metaList, dataList freelist
	ids, err := readFreeList(f.mmapedPage, a.freelistRoot, func(isMeta bool, region region) {
		lst := &dataList
		if isMeta {
			lst = &metaList
		}

		lst.avail += uint(region.count)
		lst.regions.Add(region)
	})
	if err != nil {
		return err
	}

	dataList.regions.Sort()
	dataList.regions.MergeAdjacent()
	metaList.regions.Sort()
	metaList.regions.MergeAdjacent()

	a.data.freelist = dataList
	a.meta.freelist = metaList
	a.freelistPages = ids.Regions()
	return nil
}
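// Round-trip note: readAllocatorState above is the load-time counterpart of
// fileCommitSerialize/writeFreeLists. Freelist entries are read back tagged
// as meta or data, then sorted and merged, so the in-memory freelists are
// canonical regardless of the on-disk entry order.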
// allocator helpers/utilities
// ---------------------------

// allocFromFreelist allocates up to 'max' pages from the free list. The
// number of allocated pages is returned.
func allocFromFreelist(
	f *freelist,
	area *txAllocArea,
	order *allocOrder,
	max uint,
	fn func(region),
) uint {
	count := max
	if f.avail < count {
		count = f.avail
	}

	f.AllocRegionsWith(order, count, func(region region) {
		region.EachPage(area.allocated.Add)
		fn(region)
	})
	return count
}

func allocContFromFreelist(
	f *freelist,
	area *txAllocArea,
	order *allocOrder,
	n uint,
) region {
	region := f.AllocContinuousRegion(order, n)
	if region.id != 0 {
		region.EachPage(area.new.Add)
	}
	return region
}

func allocFromArea(area *txAllocArea, marker *PageID, count uint, fn func(region)) {
	// a single region can hold at most math.MaxUint32 pages -> allocate in a loop
	id := *marker
	for count > 0 {
		n := count
		if n > math.MaxUint32 {
			n = math.MaxUint32
		}

		region := region{id: id, count: uint32(n)}
		region.EachPage(area.new.Add)
		fn(region)

		id += PageID(n)
		count -= n
	}
	*marker = id
}
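// allocFromArea illustration (hypothetical numbers): with *marker == 10 and
// count == 3, fn receives region{id: 10, count: 3}, pages 10, 11, and 12 are
// recorded in area.new, and the marker advances to 13. Only counts above
// math.MaxUint32 are split across multiple regions.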