ee/app/models/geo_node_status.rb (361 lines of code) (raw):
# frozen_string_literal: true
class GeoNodeStatus < ApplicationRecord
# ShaAttribute presumably supplies the `sha_attribute` macro used further down
# in this class (its definition is not visible here).
include ShaAttribute
# Every status record is owned by a single Geo node.
belongs_to :geo_node
# Lets callers ask the status for the owning node's selective_sync_type.
delegate :selective_sync_type, to: :geo_node
# Captures the repository-verification flag whenever an instance is built or loaded.
after_initialize :initialize_feature_flags
# Plain Ruby accessors: set by load_data_from_current_node (storage_shards) and
# initialize_feature_flags (repository_verification_enabled).
attr_accessor :storage_shards
attr_accessor :repository_verification_enabled
# Prometheus metrics, no need to store them in the database.
# They are declared as plain Ruby accessors, so each value lives only on the
# instance that computed or received it.
attr_accessor :event_log_max_id, :repository_created_max_id, :repository_updated_max_id,
:repository_deleted_max_id, :repository_renamed_max_id, :repositories_changed_max_id,
:lfs_object_deleted_max_id, :job_artifact_deleted_max_id,
:lfs_objects_registry_count, :job_artifacts_registry_count, :attachments_registry_count,
:hashed_storage_migrated_max_id, :hashed_storage_attachments_max_id,
:repositories_checked_count, :repositories_checked_failed_count
# NOTE(review): sha_attribute comes from ShaAttribute; presumably it stores the
# digest in binary form while exposing a hex string - confirm in that concern.
sha_attribute :storage_configuration_digest
# Repositories and wikis are both counted per project, so both names read the
# same projects_count attribute.
alias_attribute :repositories_count, :projects_count
alias_attribute :wikis_count, :projects_count
# Generates `<attribute>_timestamp` / `<attribute>_timestamp=` for every
# attribute; they dispatch to #attribute_timestamp / #attribute_timestamp=
# defined in this class.
attribute_method_suffix '_timestamp', '_timestamp='
# Shorter public names for the generated timestamp accessors.
alias_attribute :last_successful_status_check_timestamp, :last_successful_status_check_at_timestamp
alias_attribute :last_event_timestamp, :last_event_date_timestamp
alias_attribute :cursor_last_event_timestamp, :cursor_last_event_date_timestamp
# Maps each exported metric name to its human-readable description.
# Be sure to keep this consistent with Prometheus naming conventions.
# NOTE(review): each key is presumably also the name of a reader on this model
# that the metrics exporter calls - verify against the exporter before renaming.
PROMETHEUS_METRICS = {
db_replication_lag_seconds: 'Database replication lag (seconds)',
repositories_count: 'Total number of repositories available on primary',
repositories_synced_count: 'Number of repositories synced on secondary',
repositories_failed_count: 'Number of repositories failed to sync on secondary',
wikis_synced_count: 'Number of wikis synced on secondary',
wikis_failed_count: 'Number of wikis failed to sync on secondary',
repositories_checksummed_count: 'Number of repositories checksummed on primary',
repositories_checksum_failed_count: 'Number of repositories failed to calculate the checksum on primary',
wikis_checksummed_count: 'Number of wikis checksummed on primary',
wikis_checksum_failed_count: 'Number of wikis failed to calculate the checksum on primary',
repositories_verified_count: 'Number of repositories verified on secondary',
repositories_verification_failed_count: 'Number of repositories failed to verify on secondary',
repositories_checksum_mismatch_count: 'Number of repositories that checksum mismatch on secondary',
wikis_verified_count: 'Number of wikis verified on secondary',
wikis_verification_failed_count: 'Number of wikis failed to verify on secondary',
wikis_checksum_mismatch_count: 'Number of wikis that checksum mismatch on secondary',
lfs_objects_count: 'Total number of syncable LFS objects available on primary',
lfs_objects_synced_count: 'Number of syncable LFS objects synced on secondary',
lfs_objects_failed_count: 'Number of syncable LFS objects failed to sync on secondary',
lfs_objects_registry_count: 'Number of LFS objects in the registry',
lfs_objects_synced_missing_on_primary_count: 'Number of LFS objects marked as synced due to the file missing on the primary',
job_artifacts_count: 'Total number of syncable job artifacts available on primary',
job_artifacts_synced_count: 'Number of syncable job artifacts synced on secondary',
job_artifacts_failed_count: 'Number of syncable job artifacts failed to sync on secondary',
job_artifacts_registry_count: 'Number of job artifacts in the registry',
job_artifacts_synced_missing_on_primary_count: 'Number of job artifacts marked as synced due to the file missing on the primary',
attachments_count: 'Total number of syncable file attachments available on primary',
attachments_synced_count: 'Number of syncable file attachments synced on secondary',
attachments_failed_count: 'Number of syncable file attachments failed to sync on secondary',
attachments_registry_count: 'Number of attachments in the registry',
attachments_synced_missing_on_primary_count: 'Number of attachments marked as synced due to the file missing on the primary',
replication_slots_count: 'Total number of replication slots on the primary',
replication_slots_used_count: 'Number of replication slots in use on the primary',
replication_slots_max_retained_wal_bytes: 'Maximum number of bytes retained in the WAL on the primary',
last_event_id: 'Database ID of the latest event log entry on the primary',
last_event_timestamp: 'Time of the latest event log entry on the primary',
cursor_last_event_id: 'Last database ID of the event log processed by the secondary',
cursor_last_event_timestamp: 'Time of the event log processed by the secondary',
last_successful_status_check_timestamp: 'Time when Geo node status was updated internally',
status_message: 'Summary of health status',
event_log_max_id: 'Highest ID present in the Geo event log',
repository_created_max_id: 'Highest ID present in repositories created',
repository_updated_max_id: 'Highest ID present in repositories updated',
repository_deleted_max_id: 'Highest ID present in repositories deleted',
repository_renamed_max_id: 'Highest ID present in repositories renamed',
repositories_changed_max_id: 'Highest ID present in repositories changed',
lfs_object_deleted_max_id: 'Highest ID present in LFS objects deleted',
job_artifact_deleted_max_id: 'Highest ID present in job artifacts deleted',
hashed_storage_migrated_max_id: 'Highest ID present in projects migrated to hashed storage',
hashed_storage_attachments_max_id: 'Highest ID present in attachments migrated to hashed storage',
repositories_checked_count: 'Number of repositories checked',
repositories_checked_failed_count: 'Number of failed repositories checked',
repositories_retrying_verification_count: 'Number of repositories verification failures that Geo is actively trying to correct on secondary',
wikis_retrying_verification_count: 'Number of wikis verification failures that Geo is actively trying to correct on secondary',
container_repositories_count: 'Total number of syncable container repositories available on primary',
container_repositories_synced_count: 'Number of syncable container repositories synced on secondary',
container_repositories_failed_count: 'Number of syncable container repositories failed to sync on secondary',
container_repositories_registry_count: 'Number of container repositories in the registry',
design_repositories_count: 'Total number of syncable design repositories available on primary',
design_repositories_synced_count: 'Number of syncable design repositories synced on secondary',
design_repositories_failed_count: 'Number of syncable design repositories failed to sync on secondary',
design_repositories_registry_count: 'Number of design repositories in the registry'
}.freeze
# A status whose updated_at is older than this many minutes counts as outdated
# (see #outdated? and the message built in #health).
EXPIRATION_IN_MINUTES = 5
# Labels returned by #health_status; HEALTHY_STATUS is also the status_message
# value that #status_message_healthy? accepts as healthy.
HEALTHY_STATUS = 'Healthy'.freeze
UNHEALTHY_STATUS = 'Unhealthy'.freeze
# Builds a freshly computed status for the Geo node this process runs on.
#
# The node's status object is found or built, refreshed from live data, and
# saved only when the current node is the primary.
#
# @return [Object, nil] the refreshed status, or nil when there is no current
#   Geo node
def self.current_node_status
  node = Gitlab::Geo.current_node
  return unless node

  node.find_or_build_status.tap do |node_status|
    node_status.load_data_from_current_node
    node_status.save if Gitlab::Geo.primary?
  end
end
# Returns the status rebuilt from the cached attributes, without recomputing.
#
# On a cache miss nothing is computed inline: a background refresh is
# requested through spawn_worker and nil is returned.
#
# @return [GeoNodeStatus, nil]
def self.fast_current_node_status
  cached_attrs = Rails.cache.read(cache_key)
  return new(cached_attrs) if cached_attrs

  spawn_worker
  nil
end
# Enqueues ::Geo::MetricsUpdateWorker asynchronously.
# NOTE(review): presumably that worker recomputes the status and repopulates
# the cache read by fast_current_node_status - confirm in the worker.
def self.spawn_worker
::Geo::MetricsUpdateWorker.perform_async
end
# Cache key under which the current node's status attributes are stored.
#
# @return [String] "geo-node:<current node id>:status"
def self.cache_key
  node_id = Gitlab::Geo.current_node.id
  "geo-node:#{node_id}:status"
end
# Builds a status from a hash of attributes, typically decoded JSON.
#
# Only keys listed in allowed_params are kept. The filtering works on a copy,
# so the caller's hash is left untouched (the previous `slice!` call stripped
# the disallowed keys out of the argument itself).
#
# @param json_data [Hash] attribute names mapped to values
# @return [GeoNodeStatus] a new, unsaved status
def self.from_json(json_data)
  permitted = json_data.slice(*allowed_params)

  GeoNodeStatus.new(HashWithIndifferentAccess.new(permitted))
end
# Column names that allowed_params removes from the accepted attribute list.
EXCLUDED_PARAMS = %w[id created_at].freeze
# Non-column names that allowed_params adds to the accepted attribute list.
EXTRA_PARAMS = %w[
last_event_timestamp
cursor_last_event_timestamp
storage_shards
].freeze
# Names of the attributes from_json accepts: every column except
# EXCLUDED_PARAMS, followed by EXTRA_PARAMS.
#
# @return [Array<String>]
def self.allowed_params
  accepted_columns = column_names - EXCLUDED_PARAMS
  accepted_columns + EXTRA_PARAMS
end
# after_initialize hook: snapshots whether repository verification is enabled
# so that load_verification_data can consult the instance-level flag.
def initialize_feature_flags
self.repository_verification_enabled = Gitlab::Geo.repository_verification_enabled?
end
# Writes this record's attributes to the Rails cache under the class-level
# cache key, which is where fast_current_node_status reads them from.
def update_cache!
  key = self.class.cache_key
  Rails.cache.write(key, attributes)
end
# Refreshes this status in place from the live state of the current node.
# Nothing is saved here; persisting is left to the caller.
def load_data_from_current_node
# Most recent Geo event; id and date stay nil when the log is empty.
latest_event = Geo::EventLog.latest_event
self.last_event_id = latest_event&.id
self.last_event_date = latest_event&.created_at
self.last_successful_status_check_at = Time.now
self.storage_shards = StorageShard.all
self.storage_configuration_digest = StorageShard.build_digest
self.version = Gitlab::VERSION
self.revision = Gitlab.revision
self.projects_count = geo_node.projects.count
# The loaders below are role-aware: primary-only and secondary-only loaders
# return early (or pick a branch) depending on Gitlab::Geo.primary?/secondary?.
load_status_message
load_event_data
load_primary_data
load_secondary_data
load_repository_check_data
load_verification_data
end
# Event id the secondary's cursor is effectively at: the smaller of the lowest
# tracked gap id and the last processed event id, ignoring whichever is nil.
#
# @return [Integer, nil] nil on a non-secondary node or when neither id exists
def current_cursor_last_event_id
  return unless Gitlab::Geo.secondary?

  candidate_ids = [
    ::Gitlab::Geo::EventGapTracking.min_gap_id,
    Geo::EventLogState.last_processed&.event_id
  ]

  candidate_ids.compact.min
end
# A status is healthy when it is not outdated and its status message reports
# a healthy state.
def healthy?
  return false if outdated?

  status_message_healthy?
end
# Human-readable health summary: the stored status message, unless the status
# is outdated, in which case a staleness notice is returned instead.
#
# @return [String, nil]
def health
  return status_message unless outdated?

  "Status has not been updated in the past #{EXPIRATION_IN_MINUTES} minutes"
end
# Collapses #healthy? into one of the two status labels.
#
# @return [String] HEALTHY_STATUS or UNHEALTHY_STATUS
def health_status
  return HEALTHY_STATUS if healthy?

  UNHEALTHY_STATUS
end
# True when the record was last updated more than EXPIRATION_IN_MINUTES ago.
# A record without an updated_at value is never considered outdated.
def outdated?
  last_update = updated_at
  return false unless last_update

  cutoff = EXPIRATION_IN_MINUTES.minutes.ago
  last_update < cutoff
end
# The status message signals health when it is blank or equals HEALTHY_STATUS.
def status_message_healthy?
  message = status_message
  return true if message.blank?

  message == HEALTHY_STATUS
end
# Reader behind the `<attribute>_timestamp` methods generated by
# attribute_method_suffix: the attribute's value converted with #to_i
# (so a nil value yields 0).
#
# @param attr [String, Symbol] name of the underlying attribute
# @return [Integer]
def attribute_timestamp(attr)
  stored_value = self[attr]
  stored_value.to_i
end
# Writer behind the `<attribute>_timestamp=` methods generated by
# attribute_method_suffix: stores the given epoch seconds as a Time.
#
# A nil value clears the attribute. Previously nil reached Time.at, which
# raises TypeError, so one missing timestamp in an attributes hash aborted
# the whole assignment.
#
# @param attr [String, Symbol] name of the underlying attribute
# @param value [Numeric, nil] seconds since the Unix epoch
def attribute_timestamp=(attr, value)
  self[attr] = value.nil? ? nil : Time.at(value)
end
# Defines `<attr_name>_in_percentage`, returning count / total * 100.
#
# Both values are read through this class's `[]`, which dispatches to the
# public reader of that name. The previous `read_attribute` lookup only sees
# attributes held by the record, so counts declared with a plain
# `attr_accessor` (such as repositories_checked_count) always read as nil and
# the percentage was stuck at 0.
#
# @param attr_name [Symbol, String] prefix of the generated method
# @param count [Symbol] reader providing the numerator
# @param total [Symbol] reader providing the denominator
# @return the generated method returns 0 when the total is zero or nil,
#   otherwise a Float percentage
def self.attr_in_percentage(attr_name, count, total)
  define_method("#{attr_name}_in_percentage") do
    return 0 if self[total].to_i.zero?

    (self[count].to_f / self[total].to_f) * 100.0
  end
end
# Generated `<name>_in_percentage` readers.
# Arguments: generated-method prefix, numerator reader, denominator reader.
attr_in_percentage :repositories_synced, :repositories_synced_count, :repositories_count
attr_in_percentage :repositories_checksummed, :repositories_checksummed_count, :repositories_count
attr_in_percentage :repositories_verified, :repositories_verified_count, :repositories_count
attr_in_percentage :repositories_checked, :repositories_checked_count, :repositories_count
attr_in_percentage :wikis_synced, :wikis_synced_count, :wikis_count
attr_in_percentage :wikis_checksummed, :wikis_checksummed_count, :wikis_count
attr_in_percentage :wikis_verified, :wikis_verified_count, :wikis_count
attr_in_percentage :lfs_objects_synced, :lfs_objects_synced_count, :lfs_objects_count
attr_in_percentage :job_artifacts_synced, :job_artifacts_synced_count, :job_artifacts_count
attr_in_percentage :attachments_synced, :attachments_synced_count, :attachments_count
attr_in_percentage :replication_slots_used, :replication_slots_used_count, :replication_slots_count
attr_in_percentage :container_repositories_synced, :container_repositories_synced_count, :container_repositories_count
attr_in_percentage :design_repositories_synced, :design_repositories_synced_count, :design_repositories_count
# Whether this node's storage configuration digest equals the primary's.
#
# Always true for the primary node itself. For any other node both digests
# must be present and equal; the primary's digest is only looked up once this
# node's own digest is known to be present.
def storage_shards_match?
  return true if geo_node.primary?

  own_digest = storage_configuration_digest
  return false unless own_digest

  primary_digest = primary_storage_digest
  return false unless primary_digest

  own_digest == primary_digest
end
# Bracket read access dispatches to the public method named `key`, so
# `status[:name]` works for any public reader on this object rather than only
# for stored attributes. Only reading is overridden here; `[]=` is untouched.
def [](key)
public_send(key) # rubocop:disable GitlabSecurity/PublicSend
end
private
# Stores the outcome of the 'geo' health check as the status message. If the
# check raises NotImplementedError, the error's text is stored instead.
def load_status_message
  message =
    begin
      HealthCheck::Utils.process_checks(['geo'])
    rescue NotImplementedError => error
      error.to_s
    end

  self.status_message = message
end
# Records the highest id currently present for the Geo event log and for each
# Geo event type. These values go to the in-memory metric accessors.
def load_event_data
self.event_log_max_id = Geo::EventLog.maximum(:id)
self.repository_created_max_id = Geo::RepositoryCreatedEvent.maximum(:id)
self.repository_updated_max_id = Geo::RepositoryUpdatedEvent.maximum(:id)
self.repository_deleted_max_id = Geo::RepositoryDeletedEvent.maximum(:id)
self.repository_renamed_max_id = Geo::RepositoryRenamedEvent.maximum(:id)
self.repositories_changed_max_id = Geo::RepositoriesChangedEvent.maximum(:id)
self.lfs_object_deleted_max_id = Geo::LfsObjectDeletedEvent.maximum(:id)
self.job_artifact_deleted_max_id = Geo::JobArtifactDeletedEvent.maximum(:id)
self.hashed_storage_migrated_max_id = Geo::HashedStorageMigratedEvent.maximum(:id)
self.hashed_storage_attachments_max_id = Geo::HashedStorageAttachmentsEvent.maximum(:id)
end
# Primary-only figures: total counts of LFS objects, non-expired job artifacts
# and uploads, plus the node's replication slot statistics.
# Does nothing unless the current node is the primary.
def load_primary_data
  return unless Gitlab::Geo.primary?

  self.lfs_objects_count = LfsObject.count
  self.job_artifacts_count = Ci::JobArtifact.not_expired.count
  self.attachments_count = Upload.count

  node = geo_node
  self.replication_slots_count = node.replication_slots_count
  self.replication_slots_used_count = node.replication_slots_used_count
  self.replication_slots_max_retained_wal_bytes = node.replication_slots_max_retained_wal_bytes
end
# Secondary-only figures: replication lag, event cursor position, repository
# and wiki sync counts, then the per-type loaders for LFS objects, job
# artifacts, attachments, container repositories and design repositories.
# Does nothing unless the current node is a secondary.
def load_secondary_data
  return unless Gitlab::Geo.secondary?

  self.db_replication_lag_seconds = Gitlab::Geo::HealthCheck.new.db_replication_lag_seconds

  # The cursor date is the creation time of the event the cursor points at,
  # or nil when that event cannot be found.
  self.cursor_last_event_id = current_cursor_last_event_id
  cursor_event = Geo::EventLog.find_by(id: cursor_last_event_id)
  self.cursor_last_event_date = cursor_event&.created_at

  self.repositories_synced_count = registries_for_synced_projects(:repository).count
  self.repositories_failed_count = registries_for_failed_projects(:repository).count
  self.wikis_synced_count = registries_for_synced_projects(:wiki).count
  self.wikis_failed_count = registries_for_failed_projects(:wiki).count

  load_lfs_objects_data
  load_job_artifacts_data
  load_attachments_data
  load_container_registry_data
  load_designs_data
end
# Copies the LFS object counters reported by lfs_objects_finder onto this status.
def load_lfs_objects_data
  finder = lfs_objects_finder

  self.lfs_objects_count = finder.count_syncable
  self.lfs_objects_synced_count = finder.count_synced
  self.lfs_objects_failed_count = finder.count_failed
  self.lfs_objects_registry_count = finder.count_registry
  self.lfs_objects_synced_missing_on_primary_count = finder.count_synced_missing_on_primary
end
# Copies the job artifact counters reported by job_artifacts_finder onto this status.
def load_job_artifacts_data
  finder = job_artifacts_finder

  self.job_artifacts_count = finder.count_syncable
  self.job_artifacts_synced_count = finder.count_synced
  self.job_artifacts_failed_count = finder.count_failed
  self.job_artifacts_registry_count = finder.count_registry
  self.job_artifacts_synced_missing_on_primary_count = finder.count_synced_missing_on_primary
end
# Copies the attachment counters reported by attachments_finder onto this status.
def load_attachments_data
  finder = attachments_finder

  self.attachments_count = finder.count_syncable
  self.attachments_synced_count = finder.count_synced
  self.attachments_failed_count = finder.count_failed
  self.attachments_registry_count = finder.count_registry
  self.attachments_synced_missing_on_primary_count = finder.count_synced_missing_on_primary
end
# Copies the container repository counters reported by
# container_registry_finder onto this status.
def load_container_registry_data
  finder = container_registry_finder

  self.container_repositories_count = finder.count_syncable
  self.container_repositories_synced_count = finder.count_synced
  self.container_repositories_failed_count = finder.count_failed
  self.container_repositories_registry_count = finder.count_registry
end
# Copies the design repository counters reported by design_registry_finder
# onto this status.
def load_designs_data
  finder = design_registry_finder

  self.design_repositories_count = finder.count_syncable
  self.design_repositories_synced_count = finder.count_synced
  self.design_repositories_failed_count = finder.count_failed
  self.design_repositories_registry_count = finder.count_registry
end
# Counts repositories that have been checked and those whose last check
# failed. The primary counts Project rows and a secondary counts
# Geo::ProjectRegistry rows. Nothing is set when the node is neither.
def load_repository_check_data
  checked_scope =
    if Gitlab::Geo.primary?
      Project
    elsif Gitlab::Geo.secondary?
      Geo::ProjectRegistry
    end

  return unless checked_scope

  self.repositories_checked_count = checked_scope.where.not(last_repository_check_at: nil).count
  self.repositories_checked_failed_count = checked_scope.where(last_repository_check_failed: true).count
end
# Loads repository and wiki verification counters, but only when repository
# verification is enabled for this instance.
#
# The primary reports checksum counts from the repository verification
# finder. A secondary reports verified, failed, mismatched and retrying counts
# from the project registry finders.
def load_verification_data
  return unless repository_verification_enabled

  if Gitlab::Geo.primary?
    finder = repository_verification_finder

    self.repositories_checksummed_count = finder.count_verified_repositories
    self.repositories_checksum_failed_count = finder.count_verification_failed_repositories
    self.wikis_checksummed_count = finder.count_verified_wikis
    self.wikis_checksum_failed_count = finder.count_verification_failed_wikis
  elsif Gitlab::Geo.secondary?
    self.repositories_verified_count = registries_for_verified_projects(:repository).count
    self.repositories_verification_failed_count = registries_for_verification_failed_projects(:repository).count
    self.repositories_checksum_mismatch_count = registries_for_mismatch_projects(:repository).count

    self.wikis_verified_count = registries_for_verified_projects(:wiki).count
    self.wikis_verification_failed_count = registries_for_verification_failed_projects(:wiki).count
    self.wikis_checksum_mismatch_count = registries_for_mismatch_projects(:wiki).count

    self.repositories_retrying_verification_count = registries_retrying_verification(:repository).count
    self.wikis_retrying_verification_count = registries_retrying_verification(:wiki).count
  end
end
# Storage configuration digest taken from the primary node's status, memoised
# on this instance once a truthy value has been obtained.
def primary_storage_digest
  @primary_storage_digest ||= begin
    primary_status = Gitlab::Geo.primary_node.find_or_build_status
    primary_status.storage_configuration_digest
  end
end
# Memoised Geo::AttachmentRegistryFinder scoped to this status's node id.
def attachments_finder
  @attachments_finder ||= begin
    node_id = geo_node.id
    Geo::AttachmentRegistryFinder.new(current_node_id: node_id)
  end
end
# Memoised Geo::LfsObjectRegistryFinder scoped to this status's node id.
def lfs_objects_finder
  @lfs_objects_finder ||= begin
    node_id = geo_node.id
    Geo::LfsObjectRegistryFinder.new(current_node_id: node_id)
  end
end
# Memoised Geo::JobArtifactRegistryFinder scoped to this status's node id.
def job_artifacts_finder
  @job_artifacts_finder ||= begin
    node_id = geo_node.id
    Geo::JobArtifactRegistryFinder.new(current_node_id: node_id)
  end
end
# Memoised Geo::ContainerRepositoryRegistryFinder scoped to this status's node id.
def container_registry_finder
  @container_registry_finder ||= begin
    node_id = geo_node.id
    Geo::ContainerRepositoryRegistryFinder.new(current_node_id: node_id)
  end
end
# Memoised Geo::DesignRegistryFinder scoped to this status's node id.
def design_registry_finder
  @design_registry_finder ||= begin
    node_id = geo_node.id
    Geo::DesignRegistryFinder.new(current_node_id: node_id)
  end
end
# Runs Geo::ProjectRegistrySyncedFinder for this node and the given type.
#
# @param type [Symbol] callers in this class pass :repository or :wiki
# @return whatever the finder's #execute returns (callers invoke #count on it)
def registries_for_synced_projects(type)
  finder = Geo::ProjectRegistrySyncedFinder.new(current_node: geo_node, type: type)
  finder.execute
end
# Runs Geo::ProjectRegistrySyncFailedFinder for this node and the given type.
#
# @param type [Symbol] callers in this class pass :repository or :wiki
# @return whatever the finder's #execute returns (callers invoke #count on it)
def registries_for_failed_projects(type)
  finder = Geo::ProjectRegistrySyncFailedFinder.new(current_node: geo_node, type: type)
  finder.execute
end
# Runs Geo::ProjectRegistryVerifiedFinder for this node and the given type.
#
# @param type [Symbol] callers in this class pass :repository or :wiki
# @return whatever the finder's #execute returns (callers invoke #count on it)
def registries_for_verified_projects(type)
  finder = Geo::ProjectRegistryVerifiedFinder.new(current_node: geo_node, type: type)
  finder.execute
end
# Runs Geo::ProjectRegistryVerificationFailedFinder for this node and the
# given type.
#
# @param type [Symbol] callers in this class pass :repository or :wiki
# @return whatever the finder's #execute returns (callers invoke #count on it)
def registries_for_verification_failed_projects(type)
  finder = Geo::ProjectRegistryVerificationFailedFinder.new(current_node: geo_node, type: type)
  finder.execute
end
# Runs Geo::ProjectRegistryMismatchFinder for this node and the given type.
#
# @param type [Symbol] callers in this class pass :repository or :wiki
# @return whatever the finder's #execute returns (callers invoke #count on it)
def registries_for_mismatch_projects(type)
  finder = Geo::ProjectRegistryMismatchFinder.new(current_node: geo_node, type: type)
  finder.execute
end
# Runs Geo::ProjectRegistryRetryingVerificationFinder for this node and the
# given type.
#
# @param type [Symbol] callers in this class pass :repository or :wiki
# @return whatever the finder's #execute returns (callers invoke #count on it)
def registries_retrying_verification(type)
  finder = Geo::ProjectRegistryRetryingVerificationFinder.new(current_node: geo_node, type: type)
  finder.execute
end
# Memoised Geo::RepositoryVerificationFinder; load_verification_data uses it
# for the primary's checksum counts.
def repository_verification_finder
@repository_verification_finder ||= Geo::RepositoryVerificationFinder.new
end
end