scripts/extract_source.rb:
# rubocop:disable all
# You need to call this with the PostgreSQL source directory as the first command-line argument, and the output directory as the second:
# ./scripts/extract_source.rb ./tmp/postgres ./src/postgres
require 'ffi/clang'
require 'fileutils'
require 'json'
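# Extend ffi-clang with clang_Cursor_getStorageClass so we can tell whether a
# variable declaration uses extern storage (i.e. is only a declaration, not a definition).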
module FFI::Clang::Lib
enum :storage_class, [
:invalid, 0,
:none, 1,
:extern, 2,
:static, 3,
:private_extern, 4,
:opencl_workgroup_local, 5,
:auto, 6,
:register, 7,
]
attach_function :get_storage_class, :clang_Cursor_getStorageClass, [FFI::Clang::Lib::CXCursor.by_value], :storage_class
end
module FFI::Clang
class Cursor
def storage_class
Lib.get_storage_class(@cursor)
end
# Copy of clang::VarDecl::hasExternalStorage http://clang.llvm.org/doxygen/Decl_8h_source.html#l00982
def has_external_storage
storage_class == :extern || storage_class == :private_extern
end
end
end
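# Walks the PostgreSQL source tree, analyzes each .c file with libclang, and
# resolves the transitive dependencies of a handful of entry-point symbols so
# that only the code we actually need gets copied into the output directory.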
class Runner
attr_reader :unresolved
attr_reader :code_for_resolve
def initialize
@file_analysis = {}
@global_method_to_base_filename = {}
@file_to_method_and_pos = {}
@external_variables = []
@resolved_static_by_base_filename = {}
@resolved_global = []
@symbols_to_output = {}
@include_files_to_output = []
@unresolved = []
@blacklist = []
@mock = {}
@basepath = File.absolute_path(ARGV[0]) + '/'
@out_path = File.absolute_path(ARGV[1]) + '/'
end
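# Symbols that must never be reached during resolution; hitting one aborts the run.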
def blacklist(symbol)
@blacklist << symbol
end
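# Replaces a symbol's extracted implementation with the given hand-written stub at output time.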
def mock(symbol, code)
@mock[symbol] = code
end
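# Gathers the backend/common/port/timezone/plpgsql/pgcrypto sources (minus
# platform-specific files and files that are only compiled by being #included
# from another .c file), analyzes each one, and records which file defines which symbol.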
def run
files = Dir.glob(@basepath + 'src/backend/**/*.c') +
Dir.glob(@basepath + 'src/common/**/*.c') +
Dir.glob(@basepath + 'src/port/**/*.c') +
Dir.glob(@basepath + 'src/timezone/**/*.c') +
Dir.glob(@basepath + 'src/pl/plpgsql/src/*.c') +
Dir.glob(@basepath + 'contrib/pgcrypto/*.c') -
[ # Blacklist
@basepath + 'src/backend/libpq/be-secure-openssl.c', # OpenSSL include error
@basepath + 'src/backend/utils/adt/levenshtein.c', # Built through varlena.c
@basepath + 'src/backend/utils/adt/like_match.c', # Built through like.c
@basepath + 'src/backend/utils/misc/guc-file.c', # Built through guc.c
@basepath + 'src/backend/utils/sort/qsort_tuple.c', # Built through tuplesort.c
@basepath + 'src/backend/bootstrap/bootscanner.c', # Built through bootparse.c
@basepath + 'src/backend/regex/regc_color.c', # Built through regcomp.c
@basepath + 'src/backend/regex/regc_cvec.c', # Built through regcomp.c
@basepath + 'src/backend/regex/regc_lex.c', # Built through regcomp.c
@basepath + 'src/backend/regex/regc_pg_locale.c', # Built through regcomp.c
@basepath + 'src/backend/regex/regc_locale.c', # Built through regcomp.c
@basepath + 'src/backend/regex/regc_nfa.c', # Built through regcomp.c
@basepath + 'src/backend/regex/rege_dfa.c', # Built through regexec.c
@basepath + 'src/backend/replication/repl_scanner.c', # Built through repl_gram.c
@basepath + 'src/backend/replication/libpqwalreceiver/libpqwalreceiver.c',
@basepath + 'src/backend/replication/syncrep_scanner.c',
@basepath + 'src/backend/port/posix_sema.c', # Linux only
@basepath + 'src/common/fe_memutils.c', # This file is not expected to be compiled for backend code
@basepath + 'src/common/restricted_token.c', # This file is not expected to be compiled for backend code
@basepath + 'src/common/unicode/norm_test.c', # This file is not expected to be compiled for backend code
@basepath + 'src/port/dirent.c', # Win32 only
@basepath + 'src/port/getaddrinfo.c', # Win32 only
@basepath + 'src/port/getrusage.c', # Win32 only
@basepath + 'src/port/gettimeofday.c', # Win32 only
@basepath + 'src/port/strerror.c', # Win32 only
@basepath + 'src/port/strlcat.c', # Win32 only
@basepath + 'src/port/strlcpy.c', # Win32 only
@basepath + 'src/port/unsetenv.c', # Win32 only
@basepath + 'src/port/win32error.c', # Win32 only
@basepath + 'src/port/win32env.c', # Win32 only
@basepath + 'src/port/win32security.c' # Win32 only
] -
Dir.glob(@basepath + 'src/backend/port/dynloader/*.c') -
Dir.glob(@basepath + 'src/backend/port/win32/*.c') -
Dir.glob(@basepath + 'src/backend/port/win32_*.c') -
Dir.glob(@basepath + 'src/backend/snowball/**/*.c')
#files = [@basepath + 'src/backend/parser/keywords.c']
files.each do |file|
if files == [file]
puts format('Analysing single file: %s', file)
analysis = analyze_file(file)
analysis_file = analysis.save
puts format('Result: %s', analysis_file)
exit 1
end
print '.'
analysis = FileAnalysis.restore(file, @basepath) || analyze_file(file)
analysis.save
@file_analysis[file] = analysis
analysis.symbol_to_file.each do |symbol, _|
next if analysis.static_symbols.include?(symbol)
if @global_method_to_base_filename[symbol] && !['main', 'Pg_magic_func', 'pg_open_tzfile', '_PG_init'].include?(symbol) && !@global_method_to_base_filename[symbol].end_with?('c')
puts format('Error processing %s, symbol %s already defined by %s', file, symbol, @global_method_to_base_filename[symbol])
end
@global_method_to_base_filename[symbol] = file
end
analysis.file_to_symbol_positions.each do |file, method_and_pos|
@file_to_method_and_pos[file] = method_and_pos
end
analysis.external_variables.each do |symbol|
@external_variables << symbol
end
end
#puts @caller_to_static_callees['/Users/lfittl/Code/libpg_query/postgres/src/backend/regex/regc_locale.c']['cclass'].inspect
puts "\nFinished parsing"
end
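# Holds the per-file results of a libclang pass; saved as JSON under ./tmp/analysis
# so subsequent runs can skip re-parsing files that were already analyzed.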
class FileAnalysis
attr_accessor :references, :static_symbols, :symbol_to_file, :file_to_symbol_positions, :external_variables, :included_files
def initialize(filename, basepath, references = {}, static_symbols = [],
symbol_to_file = {}, file_to_symbol_positions = {}, external_variables = [],
included_files = [])
@filename = filename
@basepath = basepath
@references = references
@static_symbols = static_symbols
@symbol_to_file = symbol_to_file
@file_to_symbol_positions = file_to_symbol_positions
@external_variables = external_variables
@included_files = included_files
end
def save
json = JSON.pretty_generate({
references: @references,
static_symbols: @static_symbols,
symbol_to_file: @symbol_to_file,
file_to_symbol_positions: @file_to_symbol_positions,
external_variables: @external_variables,
included_files: @included_files,
})
file = self.class.analysis_filename(@filename, @basepath)
FileUtils.mkdir_p(File.dirname(file))
File.write(file, json)
file
end
def self.restore(filename, basepath)
json = File.read(analysis_filename(filename, basepath))
hsh = JSON.parse(json)
new(filename, basepath, hsh['references'], hsh['static_symbols'],
hsh['symbol_to_file'], hsh['file_to_symbol_positions'], hsh['external_variables'],
hsh['included_files'])
rescue Errno::ENOENT
nil
end
private
def self.analysis_filename(filename, basepath)
File.absolute_path('./tmp/analysis') + '/' + filename.gsub(%r{^#{basepath}}, '').gsub(/\.c$/, '.json')
end
end
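# Parses one .c file with libclang and records every top-level function definition
# and variable: the file and byte range it lives at, whether it has internal
# (static) linkage, whether it is a writable global (a candidate for __thread), and
# which other symbols its body references.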
def analyze_file(file)
index = FFI::Clang::Index.new(true, true)
translation_unit = index.parse_translation_unit(file, ['-I', @basepath + 'src/include', '-I', '/usr/local/opt/openssl/include', '-DDLSUFFIX=".bundle"', '-msse4.2', '-g'])
cursor = translation_unit.cursor
func_cursor = nil
analysis = FileAnalysis.new(file, @basepath)
included_files = []
translation_unit.inclusions do |included_file, _inclusions|
next if !included_file.start_with?(@basepath) || included_file == file
included_files << included_file
end
analysis.included_files = included_files.uniq.sort
cursor.visit_children do |cursor, parent|
if cursor.location.file && (File.dirname(file) == File.dirname(cursor.location.file) || cursor.location.file.end_with?('_impl.h'))
if parent.kind == :cursor_translation_unit
if (cursor.kind == :cursor_function && cursor.definition?) || (cursor.kind == :cursor_variable && !cursor.has_external_storage)
analysis.symbol_to_file[cursor.spelling] = cursor.location.file
if cursor.linkage == :external
# Nothing special
elsif cursor.linkage == :internal
(analysis.static_symbols << cursor.spelling).uniq!
else
fail format('Unknown linkage: %s', cursor.linkage.inspect)
end
start_offset = cursor.extent.start.offset
end_offset = cursor.extent.end.offset
end_offset += 1 if cursor.kind == :cursor_variable # The ";" isn't counted correctly by clang
if cursor.kind == :cursor_variable && (cursor.linkage == :external || cursor.linkage == :internal) &&
!cursor.type.const_qualified? && !cursor.type.array_element_type.const_qualified?
analysis.external_variables << cursor.spelling
end
analysis.file_to_symbol_positions[cursor.location.file] ||= {}
analysis.file_to_symbol_positions[cursor.location.file][cursor.spelling] = [start_offset, end_offset]
cursor.visit_children do |child_cursor, parent|
# Ignore variable definitions from the local scope
next :recurse if child_cursor.definition.semantic_parent == cursor
if child_cursor.kind == :cursor_decl_ref_expr || child_cursor.kind == :cursor_call_expr
analysis.references[cursor.spelling] ||= []
(analysis.references[cursor.spelling] << child_cursor.spelling).uniq!
end
:recurse
end
end
end
end
next :recurse
end
analysis
end
RESOLVE_MAX_DEPTH = 100
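# Recursively marks method_name and everything it references for output. Global
# symbols are looked up through @global_method_to_base_filename; static symbols are
# resolved within the file that references them (static_base_filename). Symbols we
# can't locate are collected in @unresolved instead of failing the run.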
def deep_resolve(method_name, depth: 0, trail: [], global_resolved_by_parent: [], static_resolved_by_parent: [], static_base_filename: nil)
if @blacklist.include?(method_name)
puts 'ERROR: Hit blacklist entry ' + method_name
puts 'Trail: ' + trail.inspect
exit 1
end
if depth > RESOLVE_MAX_DEPTH
puts 'ERROR: Exceeded max depth'
puts method_name.inspect
puts trail.inspect
exit 1
end
base_filename = static_base_filename || @global_method_to_base_filename[method_name]
if !base_filename
(@unresolved << method_name).uniq!
return
end
analysis = @file_analysis[base_filename]
fail "could not find analysis data for #{base_filename}" if analysis.nil?
# We need to determine if we can lookup the place where the method lives
implementation_filename = analysis.symbol_to_file[method_name]
if !implementation_filename
(@unresolved << method_name).uniq!
return
end
@symbols_to_output[implementation_filename] ||= []
@symbols_to_output[implementation_filename] << method_name
(@include_files_to_output += analysis.included_files).uniq!
if @mock.key?(method_name)
# Code will be overwritten at output time, no need to investigate dependents
return
end
# Now we need to resolve all symbols called by this one
dependents = (analysis.references[method_name] || [])
global_dependents = dependents.select { |c| !analysis.static_symbols.include?(c) } - global_resolved_by_parent
static_dependents = dependents.select { |c| analysis.static_symbols.include?(c) } - static_resolved_by_parent
# First, make sure we exclude all that have been visited before
@resolved_static_by_base_filename[base_filename] ||= []
global_dependents.delete_if { |s| @resolved_global.include?(s) }
static_dependents.delete_if { |s| @resolved_static_by_base_filename[base_filename].include?(s) }
# Second, make sure we never visit any of the dependents again
global_dependents.each { |s| @resolved_global << s }
static_dependents.each { |s| @resolved_static_by_base_filename[base_filename] << s }
# Third, actually traverse into the remaining, non-visited, dependents
global_dependents.each do |symbol|
deep_resolve(
symbol, depth: depth + 1, trail: trail + [method_name],
global_resolved_by_parent: global_resolved_by_parent + global_dependents
)
end
static_dependents.each do |symbol|
deep_resolve(
symbol, depth: depth + 1, trail: trail + [method_name],
global_resolved_by_parent: global_resolved_by_parent + global_dependents,
static_resolved_by_parent: static_resolved_by_parent + static_dependents,
static_base_filename: base_filename
)
end
end
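# Files that are compiled by being #included from another .c file (e.g. regcomp.c
# pulls in the regc_*.c helpers) keep their plain basename in the output directory.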
def special_include_file?(filename)
filename[/\/(reg(c|e)_[\w_]+|guc-file|qsort_tuple|repl_scanner|levenshtein|bootscanner|like_match)\.c$/] || filename[/\/[\w_]+_impl\.h$/]
end
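# Writes each file that contains resolved symbols into @out_path: unreferenced
# definitions are stripped (keeping any preprocessor directives they contained),
# mocked symbols are replaced by their stubs, writable globals are rewritten with
# __thread, and the referenced headers are copied over with the same rewrite.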
def write_out
all_thread_local_variables = []
@symbols_to_output.each do |filename, symbols|
file_thread_local_variables = []
dead_positions = (@file_to_method_and_pos[filename] || {}).dup
symbols.each do |symbol|
next if @mock.key?(symbol)
next if @external_variables.include?(symbol)
alive_pos = dead_positions[symbol]
# In some cases there are other symbols at the same location (macros), so delete by position instead of name
dead_positions.delete_if { |_,pos| pos == alive_pos }
end
full_code = File.read(filename)
str = "/*--------------------------------------------------------------------\n"
str += " * Symbols referenced in this file:\n"
symbols.each do |symbol|
str += format(" * - %s\n", symbol)
end
str += " *--------------------------------------------------------------------\n"
str += " */\n\n"
next_start_pos = 0
dead_positions.each do |symbol, pos|
fail format("Position overrun for %s in %s, next_start_pos (%d) > file length (%d)", symbol, filename, next_start_pos, full_code.size) if next_start_pos > full_code.size
fail format("Position overrun for %s in %s, dead position pos[0]-1 (%d) > file length (%d)", symbol, filename, pos[0]-1, full_code.size) if pos[0]-1 > full_code.size
str += full_code[next_start_pos...(pos[0]-1)]
skipped_code = full_code[(pos[0]-1)...pos[1]]
if @mock.key?(symbol)
str += "\n" + @mock[symbol] + "\n"
elsif @external_variables.include?(symbol) && symbols.include?(symbol)
file_thread_local_variables << symbol
if skipped_code.include?('static')
str += "\n" + skipped_code.strip.gsub('static', 'static __thread') + "\n"
else
str += "\n__thread " + skipped_code.strip + "\n"
end
else
# In the off chance that part of a macro is before a symbol (e.g. ifdef),
# but the closing part is inside (e.g. endif) we need to output all macros inside skipped parts
str += "\n" + skipped_code.scan(/^(#\s*(?:include|define|undef|if|ifdef|ifndef|else|endif))((?:[^\n]*\\\s*\n)*)([^\n]*)$/m).map { |m| m.compact.join }.join("\n")
end
next_start_pos = pos[1]
end
str += full_code[next_start_pos..-1]
# In some cases we also need to take care of definitions in the same file
file_thread_local_variables.each do |variable|
str.gsub!(/(PGDLLIMPORT|extern)\s+(const|volatile)?\s*(\w+)\s+(\*{0,2})#{variable}(\[\])?;/, "\\1 __thread \\2 \\3 \\4#{variable}\\5;")
end
all_thread_local_variables += file_thread_local_variables
if special_include_file?(filename)
out_name = File.basename(filename)
else
out_name = filename.gsub(%r{^#{@basepath}}, '').gsub('/', '_')
end
File.write(@out_path + out_name, str)
end
@include_files_to_output.each do |include_file|
next if special_include_file?(include_file)
if include_file.start_with?(@basepath + 'src/include')
out_file = @out_path + include_file.gsub(%r{^#{@basepath}src/}, '')
else
out_file = @out_path + 'include/' + File.basename(include_file)
end
code = File.read(include_file)
all_thread_local_variables.each do |variable|
code.gsub!(/(PGDLLIMPORT|extern)\s+(const|volatile)?\s*(\w+)\s+(\*{0,2})#{variable}(\[\])?;/, "\\1 __thread \\2 \\3 \\4#{variable}\\5;")
end
FileUtils.mkdir_p File.dirname(out_file)
File.write(out_file, code)
end
end
end
runner = Runner.new
runner.run
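# Symbols that must not end up in the extracted source; deep_resolve aborts if it reaches one of them.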
runner.blacklist('SearchSysCache')
runner.blacklist('heap_open')
runner.blacklist('relation_open')
runner.blacklist('RelnameGetRelid')
runner.blacklist('ProcessClientWriteInterrupt')
runner.blacklist('typeStringToTypeName')
runner.blacklist('LWLockAcquire')
runner.blacklist('SPI_freeplan')
runner.blacklist('get_ps_display')
runner.blacklist('pq_beginmessage')
# Mocks REQUIRED for basic operations (error handling, memory management)
runner.mock('ProcessInterrupts', 'void ProcessInterrupts(void) {}') # Required by errfinish
runner.mock('PqCommMethods', 'PQcommMethods *PqCommMethods = NULL;') # Required by errfinish
runner.mock('proc_exit', 'void proc_exit(int code) { printf("Terminating process due to FATAL error\n"); exit(1); }') # Required by errfinish (we use PG_TRY/PG_CATCH, so this should never be reached in practice)
runner.mock('send_message_to_server_log', 'static void send_message_to_server_log(ErrorData *edata) {}')
runner.mock('send_message_to_frontend', 'static void send_message_to_frontend(ErrorData *edata) {}')
# Mocks REQUIRED for PL/pgSQL parsing
runner.mock('format_type_be', 'char * format_type_be(Oid type_oid) { return pstrdup("-"); }')
runner.mock('build_row_from_class', 'static PLpgSQL_row *build_row_from_class(Oid classOid) { return NULL; }')
runner.mock('plpgsql_build_datatype', 'PLpgSQL_type * plpgsql_build_datatype(Oid typeOid, int32 typmod, Oid collation) { PLpgSQL_type *typ; typ = (PLpgSQL_type *) palloc0(sizeof(PLpgSQL_type)); typ->typname = pstrdup("UNKNOWN"); typ->ttype = PLPGSQL_TTYPE_SCALAR; return typ; }')
runner.mock('parse_datatype', 'static PLpgSQL_type * parse_datatype(const char *string, int location) { PLpgSQL_type *typ; typ = (PLpgSQL_type *) palloc0(sizeof(PLpgSQL_type)); typ->typname = pstrdup(string); typ->ttype = PLPGSQL_TTYPE_SCALAR; return typ; }')
runner.mock('get_collation_oid', 'Oid get_collation_oid(List *name, bool missing_ok) { return -1; }')
runner.mock('plpgsql_parse_wordtype', 'PLpgSQL_type * plpgsql_parse_wordtype(char *ident) { return NULL; }')
runner.mock('plpgsql_parse_wordrowtype', 'PLpgSQL_type * plpgsql_parse_wordrowtype(char *ident) { return NULL; }')
runner.mock('plpgsql_parse_cwordtype', 'PLpgSQL_type * plpgsql_parse_cwordtype(List *idents) { return NULL; }')
runner.mock('plpgsql_parse_cwordrowtype', 'PLpgSQL_type * plpgsql_parse_cwordrowtype(List *idents) { return NULL; }')
runner.mock('function_parse_error_transpose', 'bool function_parse_error_transpose(const char *prosrc) { return false; }')
runner.mock('free_expr', "static void free_expr(PLpgSQL_expr *expr) {}") # This would free a cached plan, which does not apply to us
runner.mock('make_return_stmt', %(
static PLpgSQL_stmt *
make_return_stmt(int location)
{
PLpgSQL_stmt_return *new;
Assert(plpgsql_curr_compile->fn_rettype == VOIDOID);
new = palloc0(sizeof(PLpgSQL_stmt_return));
new->cmd_type = PLPGSQL_STMT_RETURN;
new->lineno = plpgsql_location_to_lineno(location);
new->expr = NULL;
new->retvarno = -1;
int tok = yylex();
if (tok != ';')
{
plpgsql_push_back_token(tok);
new->expr = read_sql_expression(';', ";");
}
return (PLpgSQL_stmt *) new;
}
)) # We're always working with fn_rettype = VOIDOID, due to our use of plpgsql_compile_inline
## ---
# SQL Parsing
runner.deep_resolve('raw_parser')
# PL/pgSQL Parsing
runner.deep_resolve('plpgsql_compile_inline')
runner.deep_resolve('plpgsql_free_function_memory')
# Basic Postgres needed to call parser
runner.deep_resolve('SetDatabaseEncoding')
# Memory management needed to call parser
runner.deep_resolve('MemoryContextInit')
runner.deep_resolve('AllocSetContextCreate')
runner.deep_resolve('MemoryContextSwitchTo')
runner.deep_resolve('CurrentMemoryContext')
runner.deep_resolve('MemoryContextDelete')
runner.deep_resolve('palloc0')
# Error handling needed to call parser
runner.deep_resolve('CopyErrorData')
runner.deep_resolve('FlushErrorState')
# Needed for output funcs
runner.deep_resolve('bms_first_member')
runner.deep_resolve('bms_free')
# Needed for normalize
runner.deep_resolve('pg_qsort')
runner.deep_resolve('raw_expression_tree_walker')
# SHA1 needed for fingerprinting
runner.deep_resolve('sha1_result')
runner.deep_resolve('sha1_init')
runner.deep_resolve('sha1_loop')
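# Additional entry points can be added here with runner.deep_resolve('symbol_name'), as long as it happens before write_out.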
runner.write_out
#puts runner.unresolved.inspect
# Debugging:
# clang -Xclang -ast-dump -fsyntax-only -I src/include/ src/backend/utils/init/globals.c