# git-lfs/git_lfs.py
# -*- encoding:utf-8 -*-
# two config files are used: .git_bin_path .git_bin_url
import hashlib
import json
import logging
import os
import re
import subprocess
import sys
import traceback
# pre-compiled splitter for whitespace (tab/space) delimited lines,
# e.g. the output of `git remote -v`
blank_split = re.compile('[\t ]')
logging.basicConfig(
    format='[%(levelname)s] %(asctime)s %(filename)s[%(lineno)d] : %(message)s',
    level=logging.INFO)
# oss2 is a hard dependency; fail fast with an install hint if missing
try:
  import oss2
except ImportError:
  logging.error(
      'please install python_oss from https://github.com/aliyun/aliyun-oss-python-sdk.git'
  )
  sys.exit(1)
# config file listing which local files belong to each tracked leaf directory
git_bin_path = '.git_bin_path'
# config file mapping each leaf directory to (md5 sig, oss remote path)
git_bin_url_path = '.git_bin_url'
# temporary storage path (may be overridden by .git_oss_config_pub)
git_oss_cache_dir = '.git_oss_cache'
# get project name by using git remote -v
def get_proj_name():
  """Derive the project name from the first entry of `git remote -v`.

  Takes the URL field of the first remote line, keeps the last path
  component and strips a '.git' suffix.
  """
  remotes = subprocess.check_output(['git', 'remote', '-v']).decode('utf-8')
  first_line = remotes.split('\n')[0]
  remote_url = blank_split.split(first_line)[1]
  last_part = remote_url.split('/')[-1]
  return last_part.replace('.git', '')
# load .git_bin_url
# local_path md5 remote_path
def load_git_url():
  """Parse .git_bin_url into {leaf_path: (sig, remote_path)}.

  Best-effort: on any error (missing file, bad JSON) a warning is logged
  and whatever was parsed so far is returned.
  """
  url_map = {}
  try:
    with open(git_bin_url_path) as fin:
      for raw_line in fin:
        entry = json.loads(raw_line.strip())
        url_map[entry['leaf_path']] = (entry['sig'], entry['remote_path'])
  except Exception as ex:
    logging.warning('exception: %s' % str(ex))
  return url_map
def save_git_url(git_bin_url_map):
  """Write the url map to .git_bin_url, one JSON object per line, keys sorted
  so the file diffs deterministically."""
  with open(git_bin_url_path, 'w') as fout:
    for key in sorted(git_bin_url_map.keys()):
      sig, remote_path = git_bin_url_map[key]
      fout.write('{"leaf_path": "%s", "sig": "%s", "remote_path": "%s"}\n' %
                 (key, sig, remote_path))
def path2name(path):
  """Flatten a filesystem path into a single token usable as an object name.

  '//' runs are collapsed to '/', then '/' becomes '_'; a trailing '_'
  (from a trailing slash) is dropped and '.' maps to 'curr_dir'.

  Fixes two defects in the original: the second replace operated on `path`
  instead of `name`, silently discarding the '//' normalization, and an
  empty input crashed on `name[-1]`.
  """
  name = path.replace('//', '/')
  name = name.replace('/', '_')  # was: path.replace(...), losing the '//' fix
  if name and name[-1] == '_':
    return name[:-1]
  elif name == '.':
    return 'curr_dir'
  else:
    return name
def get_file_arr(path):
  """Return the non-directory entries directly under *path*.

  If *path* is itself a file, return it as a one-element list.
  Not recursive: subdirectories are skipped, not descended into.
  """
  if not os.path.isdir(path):
    return [path]
  return [
      path + '/' + entry
      for entry in os.listdir(path)
      if not os.path.isdir(path + '/' + entry)
  ]
def load_git_bin():
  """Parse .git_bin_path into {leaf_name: leaf_file_list}.

  Returns an empty dict when the file does not exist; corrupted lines are
  logged and skipped instead of aborting the load.
  """
  file_arr = {}
  if not os.path.exists(git_bin_path):
    return file_arr
  with open(git_bin_path, 'r') as fin:
    for line_str in fin:
      line_str = line_str.strip()
      try:
        line_json = json.loads(line_str)
        file_arr[line_json['leaf_name']] = line_json['leaf_file']
      except Exception:
        # Fix: the original called traceback.format_exc(ex), passing the
        # exception as the `limit` argument, which itself raised inside the
        # handler and crashed the load on the first corrupt line.
        logging.warning('%s is corrupted : %s' %
                        (git_bin_path, traceback.format_exc()))
  return file_arr
def save_git_bin(git_arr):
  """Write {leaf_name: leaf_file} entries to .git_bin_path, sorted for
  stable, diff-friendly output.

  Note: sorts each file list in place, canonicalizing the caller's lists too.
  """
  with open(git_bin_path, 'w') as fout:
    for leaf_path in sorted(git_arr.keys()):
      leaf_files = git_arr[leaf_path]
      leaf_files.sort()  # in-place on purpose
      # make sure that leaf_name is in front of leaf_file
      fout.write('{"leaf_name": "%s", "leaf_file": %s}\n' %
                 (leaf_path, json.dumps(leaf_files)))
def recheck_git_bin():
  """Reload .git_bin_path, drop leaf dirs/files that no longer exist on disk,
  and persist the cleaned map if anything changed. Returns the map."""
  file_arr = load_git_bin()
  changed = False
  vanished = []
  for leaf_path in file_arr:
    leaf_files = file_arr[leaf_path]
    still_there = [f for f in leaf_files if os.path.exists(f)]
    if not os.path.exists(leaf_path):
      # whole leaf directory is gone: remove the entry after the loop
      vanished.append(leaf_path)
      changed = True
    elif len(still_there) != len(leaf_files):
      file_arr[leaf_path] = still_there
      changed = True
  for leaf_path in vanished:
    del file_arr[leaf_path]
  if changed:
    save_git_bin(file_arr)
  return file_arr
# check whether a folder changed by comparing the md5 of its files' raw
# contents (not of a tar archive, whose bytes are not reproducible)
# the md5 signatures are saved in .git_bin_url
def get_local_sig(leaf_files):
  """MD5 hex digest over the concatenated bytes of *leaf_files*.

  Files are processed in sorted-name order so the signature is stable.
  Returns None (with a warning) for an empty list.
  """
  if not leaf_files:
    logging.warning('no leaf files')
    return None
  digest = hashlib.md5()
  chunk_size = 1024 * 1024 * 8  # 8 MB reads keep memory bounded
  for file_path in sorted(leaf_files):
    with open(file_path, 'rb') as fin:
      while True:
        chunk = fin.read(chunk_size)
        if not chunk:
          break
        digest.update(chunk)
  return digest.hexdigest()
def list_leafs(curr_path):
  """Collect (directory, files) pairs for every file-bearing or leaf
  directory under *curr_path*.

  For a plain file, returns [(parent_dir, [curr_path])] with '.' for a
  bare name.
  """
  leaf_list = []
  if not os.path.isdir(curr_path):
    parent = os.path.dirname(curr_path) or '.'
    leaf_list.append((parent, [curr_path]))
    return leaf_list
  for root, dirs, files in os.walk(curr_path, topdown=True):
    # skip pure intermediate dirs (subdirs but no files of their own)
    if dirs and not files:
      continue
    if root[-1] == '/':
      root = root[:-1]
    leaf_list.append((root, get_file_arr(root)))
  return leaf_list
# check whether lst0 and lst1 contain the same string elements
def lst_eq(lst0, lst1):
  """Return True iff *lst0* and *lst1* contain the same elements,
  regardless of order (multiset equality).

  The original compared lengths plus one-way membership, which wrongly
  reported e.g. ['a', 'b'] and ['a', 'a'] as equal.
  """
  return sorted(lst0) == sorted(lst1)
def merge_lst(lst0, lst1):
  """In-place union: append each element of *lst1* not already in *lst0*,
  preserving order. Returns *lst0*."""
  for item in lst1:
    if item in lst0:
      continue
    lst0.append(item)
  return lst0
def has_conflict(leaf_path, leaf_files):
  """A conflict exists when *leaf_path* is present on disk and at least one
  of *leaf_files* already exists (would be overwritten on pull)."""
  if not os.path.exists(leaf_path):
    return False
  return any(os.path.exists(leaf_file) for leaf_file in leaf_files)
def get_yes_no(msg):
  """Prompt *msg* on stdin until the user answers Y/y, N/n, or hits enter.

  Returns True for yes, False for no; an empty line defaults to No
  (the prompt advertises [N/Y] with N first). Any other input re-prompts.

  Fixes an UnboundLocalError in the original: an empty first line broke
  out of the loop before `update` was ever assigned.
  """
  update = False  # default answer when the user just hits enter
  while True:
    logging.info(msg)
    answer = sys.stdin.readline().strip()
    if len(answer) == 0:
      break
    if answer[0] in ('Y', 'y'):
      update = True
      break
    if answer[0] in ('N', 'n'):
      update = False
      break
  return update
if __name__ == '__main__':
  # CLI dispatch: push / pull / add <path> / remove <path> / resolve_conflict
  if len(sys.argv) < 2:
    logging.error(
        'usage: python git_lfs.py [pull] [push] [add filename] [resolve_conflict]'
    )
    sys.exit(1)
  home_directory = os.path.expanduser("~")
  # parse the public (committed) config: endpoint, bucket, data/cache dirs
  with open('.git_oss_config_pub', 'r') as fin:
    git_oss_data_dir = None
    host = None
    bucket_name = None
    git_oss_private_path = None
    enable_accelerate = 0  # NOTE(review): assigned but never read below
    accl_endpoint = None
    for line_str in fin:
      line_str = line_str.strip()
      if len(line_str) == 0:
        continue
      if line_str.startswith('#'):
        continue
      # expand ~/, ${TMPDIR}/ and ${PROJECT_NAME} placeholders before parsing
      line_str = line_str.replace('~/', home_directory + '/')
      line_str = line_str.replace('${TMPDIR}/',
                                  os.environ.get('TMPDIR', '/tmp/'))
      line_str = line_str.replace('${PROJECT_NAME}', get_proj_name())
      line_tok = [x.strip() for x in line_str.split('=') if x != '']
      if line_tok[0] == 'host':
        host = line_tok[1]
      elif line_tok[0] == 'git_oss_data_dir':
        git_oss_data_dir = line_tok[1].strip('/')
      elif line_tok[0] == 'bucket_name':
        bucket_name = line_tok[1]
      elif line_tok[0] == 'git_oss_private_config':
        git_oss_private_path = line_tok[1]
        if git_oss_private_path.startswith('~/'):
          git_oss_private_path = os.path.join(home_directory,
                                              git_oss_private_path[2:])
      elif line_tok[0] == 'git_oss_cache_dir':
        git_oss_cache_dir = line_tok[1]
      elif line_tok[0] == 'accl_endpoint':
        accl_endpoint = line_tok[1]
  logging.info('git_oss_data_dir=%s, host=%s, bucket_name=%s' %
               (git_oss_data_dir, host, bucket_name))
  logging.info('git_oss_cache_dir: %s' % git_oss_cache_dir)
  if not os.path.exists(git_oss_cache_dir):
    os.makedirs(git_oss_cache_dir)
  logging.info('git_oss_private_config=%s' % git_oss_private_path)
  if git_oss_private_path is not None and os.path.exists(git_oss_private_path):
    # load oss configs (credentials) — enables read-write mode
    with open(git_oss_private_path, 'r') as fin:
      for line_str in fin:
        line_str = line_str.strip()
        line_tok = [x.strip() for x in line_str.split('=') if x != '']
        if line_tok[0] in ['accessid', 'accessKeyID']:
          accessid = line_tok[1]
        elif line_tok[0] in ['accesskey', 'accessKeySecret']:
          accesskey = line_tok[1]
    oss_auth = oss2.Auth(accessid, accesskey)
    oss_bucket = oss2.Bucket(oss_auth, host, bucket_name)
  else:
    logging.info('git_oss_private_path[%s] is not found, read-only mode' %
                 git_oss_private_path)
    # pull only mode: downloads fall back to public HTTP (wget/curl)
    oss_auth = None
    oss_bucket = None
  if sys.argv[1] == 'push':
    updated = False
    git_bin_arr = recheck_git_bin()
    git_bin_url = load_git_url()
    for leaf_path in git_bin_arr:
      leaf_files = git_bin_arr[leaf_path]
      # empty directory will not be push to oss
      if len(leaf_files) == 0:
        continue
      file_name = path2name(leaf_path)
      new_sig = get_local_sig(leaf_files)
      if new_sig is None:
        continue
      # signature unchanged since last push: nothing to upload
      if leaf_path in git_bin_url and git_bin_url[leaf_path][0] == new_sig:
        continue
      # build tar file and push to oss
      file_name_with_sig = file_name + '_' + new_sig
      tar_out_path = '%s/%s.tar.gz' % (git_oss_cache_dir, file_name_with_sig)
      subprocess.check_output(['tar', '-czf', tar_out_path] + leaf_files)
      save_path = '%s/%s' % (git_oss_data_dir, file_name_with_sig)
      oss_bucket.put_object_from_file(save_path, tar_out_path)
      oss_bucket.put_object_acl(save_path, oss2.OBJECT_ACL_PUBLIC_READ)
      git_bin_url[leaf_path] = (new_sig, save_path)
      logging.info('pushed %s' % leaf_path)
      updated = True
    # drop url entries whose leaf paths are no longer tracked locally
    for leaf_path in list(git_bin_url.keys()):
      if leaf_path not in git_bin_arr:
        del git_bin_url[leaf_path]
        logging.info('dropped %s' % leaf_path)
        updated = True
    if updated:
      save_git_url(git_bin_url)
      logging.info('push succeed.')
    else:
      logging.warning('nothing to push')
    subprocess.check_output(['git', 'add', git_bin_url_path])
  elif sys.argv[1] == 'pull':
    # pull images from remote
    any_update = False
    git_bin_arr = load_git_bin()
    git_bin_url = load_git_url()
    for leaf_path in git_bin_arr:
      leaf_files = git_bin_arr[leaf_path]
      if len(leaf_files) == 0:
        # tracked empty directory: just make sure it exists locally
        if os.path.isfile(leaf_path):
          logging.error('conflicts: %s is a file, but was a dir' % leaf_path)
        elif not os.path.isdir(leaf_path):
          os.makedirs(leaf_path)
        continue
      # newly add files (not yet pushed, so nothing to pull)
      if leaf_path not in git_bin_url:
        continue
      file_name = path2name(leaf_path)
      all_file_exist = True
      for tmp in leaf_files:
        if not os.path.exists(tmp):
          all_file_exist = False
      remote_sig = git_bin_url[leaf_path][0]
      if all_file_exist:
        local_sig = get_local_sig(leaf_files)
        if local_sig == remote_sig:
          # local copy already matches remote: skip
          continue
      else:
        local_sig = ''
      update = False
      if len(sys.argv) > 2 and (sys.argv[2] == '-f' or
                                sys.argv[2] == '--force'):
        update = True
      else:
        # only prompt when local files would actually be overwritten
        if has_conflict(leaf_path, leaf_files):
          update = get_yes_no(
              'update %s using remote file[remote_sig=%s local_sig=%s]?[N/Y]' %
              (leaf_path, remote_sig, local_sig))
        else:
          update = True
      if not update:
        continue
      # pull from remote oss (with local tar cache and up to 5 retries)
      remote_path = git_bin_url[leaf_path][1]
      _, file_name_with_sig = os.path.split(remote_path)
      tar_tmp_path = '%s/%s.tar.gz' % (git_oss_cache_dir, file_name_with_sig)
      max_retry = 5
      while max_retry > 0:
        try:
          if not os.path.exists(tar_tmp_path):
            in_cache = False
            if oss_bucket:
              # authenticated download via the oss sdk
              oss_bucket.get_object_to_file(remote_path, tar_tmp_path)
            else:
              # read-only mode: fetch over public HTTP with wget/curl
              url = 'http://%s.%s/%s' % (bucket_name, host, remote_path)
              # subprocess.check_output(['wget', url, '-O', tar_tmp_path])
              if sys.platform.startswith('linux'):
                subprocess.check_output(['wget', url, '-O', tar_tmp_path])
              elif sys.platform.startswith('darwin'):
                subprocess.check_output(['curl', url, '--output', tar_tmp_path])
              elif sys.platform.startswith('win'):
                subprocess.check_output(['curl', url, '--output', tar_tmp_path])
          else:
            in_cache = True
            logging.info('%s is in cache' % file_name_with_sig)
          subprocess.check_output(['tar', '-zxf', tar_tmp_path])
          local_sig = get_local_sig(leaf_files)
          if local_sig == remote_sig:
            break
          if in_cache:
            # stale cache entry: discard and retry from remote
            logging.warning('cache invalid, will download from remote')
            os.remove(tar_tmp_path)
            continue
          logging.warning('download failed, local_sig(%s) != remote_sig(%s)' %
                          (local_sig, remote_sig))
        except subprocess.CalledProcessError as ex:
          logging.error('exception: %s' % str(ex))
        except oss2.exceptions.RequestError as ex:
          logging.error('exception: %s' % str(ex))
          os.remove(tar_tmp_path)
          # on network errors, retry against the accelerate endpoint if any
          if accl_endpoint is not None and host != accl_endpoint:
            logging.info('will try accelerate endpoint: %s' % accl_endpoint)
            host = accl_endpoint
            if oss_auth:
              oss_bucket = oss2.Bucket(oss_auth, host, bucket_name)
        max_retry -= 1
      # NOTE(review): logged even when all retries failed — verify intent
      logging.info('%s updated' % leaf_path)
      any_update = True
    if not any_update:
      logging.info('nothing to be updated')
  elif sys.argv[1] == 'add':
    add_path = sys.argv[2]
    if not os.path.exists(add_path):
      raise ValueError('add path %s does not exist' % add_path)
    bin_file_map = {}
    try:
      bin_file_map = load_git_bin()
    except Exception as ex:
      # NOTE(review): format_exc(ex) passes the exception as the `limit`
      # argument and raises in py3; should be traceback.format_exc()
      logging.warning('load_git_bin exception: %s' % traceback.format_exc(ex))
      pass
    leaf_dirs = list_leafs(add_path)
    any_new = False
    for leaf_path, leaf_files in leaf_dirs:
      # files already tracked by git must be removed from the git index
      for leaf_file in leaf_files:
        tmp_out = subprocess.check_output(['git', 'ls-files', leaf_file])
        if len(tmp_out.strip()) > 0:
          subprocess.check_output(['git', 'rm', '--cached', leaf_file])
      if leaf_path not in bin_file_map:
        bin_file_map[leaf_path] = leaf_files
        any_new = True
      else:  # check whether the files are the same
        old_leaf_files = bin_file_map[leaf_path]
        if not lst_eq(old_leaf_files, leaf_files):
          bin_file_map[leaf_path] = merge_lst(old_leaf_files, leaf_files)
          any_new = True
    if any_new:
      # write back to .git_bin_path
      save_git_bin(bin_file_map)
      logging.info('added %s' % add_path)
    else:
      logging.info('already add %s' % add_path)
    subprocess.check_output(['git', 'add', '.git_bin_path'])
  elif sys.argv[1] == 'remove':
    del_path = sys.argv[2]
    try:
      bin_file_map = load_git_bin()
    except Exception as ex:
      # NOTE(review): same format_exc(ex) issue as in the 'add' branch
      logging.warning('load_git_bin exception: %s' % traceback.format_exc(ex))
      pass
    leaf_dirs = list_leafs(del_path)
    any_update = False
    for leaf_path, leaf_files in leaf_dirs:
      if leaf_path in bin_file_map:
        for leaf_file in leaf_files:
          if leaf_file in bin_file_map[leaf_path]:
            tmp_id = bin_file_map[leaf_path].index(leaf_file)
            del bin_file_map[leaf_path][tmp_id]
            any_update = True
        # drop the leaf entry entirely once its file list is empty
        if len(bin_file_map[leaf_path]) == 0:
          del bin_file_map[leaf_path]
    if any_update:
      save_git_bin(bin_file_map)
      logging.info('remove %s' % del_path)
  elif sys.argv[1] == 'resolve_conflict':
    # merge both sides of git conflict markers in .git_bin_path:
    # merge_start: 0 = outside markers, 1 = ours (<<<<<<<), 2 = theirs (=======)
    git_objs = {}
    with open(git_bin_path, 'r') as fin:
      merge_start = 0
      for line_str in fin:
        if line_str.startswith('<<<<<<<'):
          merge_start = 1
        elif line_str.startswith('======='):
          merge_start = 2
        elif line_str.startswith('>>>>>>>'):
          merge_start = 0
        elif merge_start == 0:
          tmp_obj = json.loads(line_str)
          leaf_name = tmp_obj['leaf_name']
          leaf_file = tmp_obj['leaf_file']
          git_objs[leaf_name] = leaf_file
        elif merge_start == 1:
          tmp_obj = json.loads(line_str)
          leaf_name = tmp_obj['leaf_name']
          leaf_file = tmp_obj['leaf_file']
          git_objs[leaf_name] = leaf_file
        elif merge_start == 2:
          # theirs side: union the file lists when both sides have the leaf
          tmp_obj = json.loads(line_str)
          leaf_name = tmp_obj['leaf_name']
          leaf_file = tmp_obj['leaf_file']
          if leaf_name in git_objs:
            union = git_objs[leaf_name]
            for tmp in leaf_file:
              if tmp not in union:
                union.append(tmp)
                logging.info('add %s to %s' % (tmp, leaf_name))
            git_objs[leaf_name] = union
          else:
            git_objs[leaf_name] = leaf_file
        else:
          logging.warning('invalid state: merge_start = %d, line_str = %s' %
                          (merge_start, line_str))
    save_git_bin(git_objs)
    # rebuild .git_bin_url, keeping only entries for surviving leaf paths
    git_bin_url_map = {}
    with open(git_bin_url_path, 'r') as fin:
      merge_start = 0
      for line_str in fin:
        if line_str.startswith('<<<<<<<'):
          merge_start = 1
        elif line_str.startswith('======='):
          merge_start = 2
        elif line_str.startswith('>>>>>>>'):
          merge_start = 0
        elif merge_start in [0, 1, 2]:
          line_json = json.loads(line_str)
          if line_json['leaf_path'] in git_objs:
            git_bin_url_map[line_json['leaf_path']] = (line_json['sig'],
                                                       line_json['remote_path'])
        else:
          logging.warning('invalid state: merge_start = %d, line_str = %s' %
                          (merge_start, line_str))
    save_git_url(git_bin_url_map)
    logging.info('all conflicts fixed.')
  else:
    logging.warning('invalid cmd: %s' % sys.argv[1])
    logging.warning(
        'choices are: %s' %
        ','.join(['push', 'pull', 'add', 'remove', 'resolve_conflict']))