lib/backend/hdfsbackend/client.go (203 lines of code) (raw):
// Copyright (c) 2016-2019 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package hdfsbackend
import (
"errors"
"fmt"
"io"
"path"
"regexp"
"sync"
"github.com/uber-go/tally"
"github.com/uber/kraken/core"
"github.com/uber/kraken/lib/backend"
"github.com/uber/kraken/lib/backend/hdfsbackend/webhdfs"
"github.com/uber/kraken/lib/backend/namepath"
"github.com/uber/kraken/utils/httputil"
"github.com/uber/kraken/utils/log"
"github.com/satori/go.uuid"
"go.uber.org/zap"
"gopkg.in/yaml.v2"
)
const _hdfs = "hdfs"
func init() {
backend.Register(_hdfs, &factory{})
}
type factory struct{}
func (f *factory) Create(
confRaw interface{}, masterAuthConfig backend.AuthConfig, stats tally.Scope, _ *zap.SugaredLogger) (backend.Client, error) {
confBytes, err := yaml.Marshal(confRaw)
if err != nil {
return nil, errors.New("marshal hdfs config")
}
var config Config
if err := yaml.Unmarshal(confBytes, &config); err != nil {
return nil, errors.New("unmarshal hdfs config")
}
return NewClient(config, stats)
}
// Client is a backend.Client for HDFS.
type Client struct {
config Config
pather namepath.Pather
webhdfs webhdfs.Client
stats tally.Scope
}
// Option allows setting optional Client parameters.
type Option func(*Client)
// WithWebHDFS configures a Client with a custom webhdfs implementation.
func WithWebHDFS(w webhdfs.Client) Option {
return func(c *Client) { c.webhdfs = w }
}
// NewClient creates a new Client for HDFS.
func NewClient(config Config, stats tally.Scope, opts ...Option) (*Client, error) {
config.applyDefaults()
if !path.IsAbs(config.RootDirectory) {
return nil, errors.New("invalid config: root_directory must be absolute path")
}
pather, err := namepath.New(config.RootDirectory, config.NamePath)
if err != nil {
return nil, fmt.Errorf("namepath: %s", err)
}
webhdfs, err := webhdfs.NewClient(config.WebHDFS, config.NameNodes, config.UserName)
if err != nil {
return nil, err
}
client := &Client{config, pather, webhdfs, stats}
for _, opt := range opts {
opt(client)
}
return client, nil
}
// Stat returns blob info for name.
func (c *Client) Stat(namespace, name string) (*core.BlobInfo, error) {
path, err := c.pather.BlobPath(name)
if err != nil {
return nil, fmt.Errorf("blob path: %s", err)
}
fs, err := c.webhdfs.GetFileStatus(path)
if err != nil {
return nil, err
}
return core.NewBlobInfo(fs.Length), nil
}
// Download downloads name into dst.
func (c *Client) Download(namespace, name string, dst io.Writer) error {
path, err := c.pather.BlobPath(name)
if err != nil {
return fmt.Errorf("blob path: %s", err)
}
return c.webhdfs.Open(path, dst)
}
// Upload uploads src to name.
func (c *Client) Upload(namespace, name string, src io.Reader) error {
uploadPath := path.Join(c.config.RootDirectory, c.config.UploadDirectory, uuid.NewV4().String())
blobPath, err := c.pather.BlobPath(name)
if err != nil {
return fmt.Errorf("blob path: %s", err)
}
if err := c.webhdfs.Create(uploadPath, src); err != nil {
return err
}
if err := c.webhdfs.Mkdirs(path.Dir(blobPath)); err != nil {
return err
}
return c.webhdfs.Rename(uploadPath, blobPath)
}
var (
_ignoreRegex = regexp.MustCompile(
"^.+/repositories/.+/(_layers|_uploads|_manifests/(revisions|tags/.+/index)).*")
_stopRegex = regexp.MustCompile("^.+/repositories/.+/_manifests$")
)
type listResult struct {
dir string
list []webhdfs.FileStatus
err error
}
func (c *Client) lister(done <-chan struct{}, listJobs <-chan string, results chan<- listResult) {
for {
select {
case <-done:
return
case dir := <-listJobs:
l, err := c.webhdfs.ListFileStatus(dir)
select {
case <-done:
return
case results <- listResult{dir, l, err}:
}
}
}
}
func (c *Client) sendAll(done <-chan struct{}, dirs []string, listJobs chan<- string) {
for _, d := range dirs {
select {
case <-done:
return
case listJobs <- d:
}
}
}
// List lists names which start with prefix.
func (c *Client) List(prefix string, opts ...backend.ListOption) (*backend.ListResult, error) {
options := backend.DefaultListOptions()
for _, opt := range opts {
opt(options)
}
if options.Paginated {
return nil, errors.New("pagination not supported")
}
root := path.Join(c.pather.BasePath(), prefix)
listJobs := make(chan string)
results := make(chan listResult)
done := make(chan struct{})
var wg sync.WaitGroup
for i := 0; i < c.config.ListConcurrency; i++ {
wg.Add(1)
go func() {
c.lister(done, listJobs, results)
wg.Done()
}()
}
defer func() {
close(done)
if c.config.testing {
// Waiting might be delayed if an early error is encountered but
// other goroutines are waiting on a long http timeout. Thus, we
// only wait for each spawned goroutine to exit during testing to
// assert that no goroutines leak.
wg.Wait()
}
}()
var files []string
// Pending tracks the number of directories which are pending exploration.
// Invariant: there will be a result received for every increment made to
// pending.
pending := 1
listJobs <- root
for pending > 0 {
res := <-results
pending--
if res.err != nil {
if httputil.IsNotFound(res.err) {
continue
}
return nil, res.err
}
var dirs []string
for _, fs := range res.list {
p := path.Join(res.dir, fs.PathSuffix)
// TODO(codyg): This is an ugly hack to avoid walking through non-tags
// during Docker catalog. Ideally, only tags are located in the repositories
// directory, however in WBU2 HDFS, there are blobs here as well. At some
// point, we must migrate the data into a structure which cleanly divides
// blobs and tags (like we do in S3).
if _ignoreRegex.MatchString(p) {
continue
}
// TODO(codyg): Another ugly hack to speed up catalog performance by stopping
// early when we hit tags...
if _stopRegex.MatchString(p) {
p = path.Join(p, "tags/dummy/current/link")
fs.Type = "FILE"
}
if fs.Type == "DIRECTORY" {
// Flat directory structures are common, so accumulate directories and send
// them to the listers in a single goroutine (as opposed to a goroutine per
// directory).
dirs = append(dirs, p)
} else {
name, err := c.pather.NameFromBlobPath(p)
if err != nil {
log.With("path", p).Errorf("Error converting blob path into name: %s", err)
continue
}
files = append(files, name)
}
}
if len(dirs) > 0 {
// We cannot send list jobs and receive results in the same thread, else
// deadlock will occur.
wg.Add(1)
go func() {
c.sendAll(done, dirs, listJobs)
wg.Done()
}()
pending += len(dirs)
}
}
return &backend.ListResult{
Names: files,
}, nil
}