def SanFailback()

in automation/tinc/main/ext/qautils/gppylib/programs/clsRecoverSegment.py [0:0]


    def SanFailback(self, array_config, gpEnv):
        """Fail SAN-hosted segments back from their mirror hosts to their primaries.

        High-level flow:
          1. find failed segments (status 'd') in the SAN dbid map,
          2. find failed mountpoints (active == 'm'),
          3. keep only mountpoints whose primary host answers a remote echo,
          4. keep only segments whose every mountpoint is recoverable,
          5. stop GPDB and move each needed mountpoint back to its primary,
          6. restart in admin (master-only) mode and update
             gp_san_configuration / gp_segment_configuration (+ history rows)
             in one transaction,
          7. restart the cluster normally and notify via pg elog.

        @param array_config: object exposing getSanConfigMaps() ->
                             (san_mounts, san_mount_by_dbid)
        @param gpEnv: environment object supplying getMasterPort()
        @return: 0 on success
        @raise Exception: when no segment is recoverable, or when any
                          gpstop/gpstart step fails
        """
        # Get the configuration information maps.
        (san_mounts, san_mount_by_dbid) = array_config.getSanConfigMaps()

        # 1 Get the failed segments (status 'd' == down).
        bad_segs = {}
        for dbid, v in san_mount_by_dbid.iteritems():
            (status, content, mountlist) = v
            if status == 'd':
                self.logger.info('Bad segment with dbid %d' % dbid)
                bad_segs[dbid] = (status, content, mountlist)

        # 2 Get the failed mountpoints (currently active on the mirror host).
        bad_mounts = {}
        for mount_id, v in san_mounts.iteritems():
            if v['active'] == 'm':
                self.logger.info('Bad mountpoint with id %d' % mount_id)
                bad_mounts[mount_id] = v

        # 3 Verify that the required primary hosts are back up (this may
        #   reduce the number of recoverable segments).
        recoverable_mps = {}
        for mount_id, v in bad_mounts.iteritems():
            try:
                unix.Echo.remote('check host', 'Success', v['primaryhost'])
                recoverable_mps[mount_id] = v
            except Exception:
                # Host not available, so this mountpoint is not added to
                # recoverable_mps.  Best-effort by design: other mountpoints
                # may still be recoverable, so keep going.
                pass

        # 4 From the recoverable mountpoints, identify the recoverable
        #   segments.  A segment is recoverable only if ALL of its
        #   mountpoints are recoverable.
        recoverable_segs = {}
        for dbid, v in bad_segs.iteritems():
            (status, content, mountlist) = v
            recoverable = True
            for mount_id in mountlist:
                if mount_id not in recoverable_mps:
                    recoverable = False
                    break
            if recoverable:
                recoverable_segs[dbid] = v
            else:
                self.logger.warning('Unrecoverable segment dbid %d' % (dbid))

        if len(recoverable_segs) == 0:
            raise Exception("Found no recoverable segments.")

        # All gpstop/gpstart invocations below point at the same master
        # data directory; look it up once.
        master_data_dir = os.environ.get('MASTER_DATA_DIRECTORY')

        # 5 Stop GPDB before moving any mountpoints.
        e = os.system('gpstop -aq -d %s' % (master_data_dir))
        if e:
            self.logger.error('Failed to shutdown Greenplum Database: segment recovery cannot proceed.')
            raise Exception("Failed to shutdown GPDB. Segment recovery failed.")
        self.logger.info('Successfully shutdown the Greenplum Database')

        # 6 Move mountpoints.  For each recoverable seg, walk its mountlist:
        #   unmount on the failover host, reconnect to the primary, and
        #   mount on the primary (all performed by SanFailback_mountpoint).
        mount_done = {}
        for dbid, v in recoverable_segs.iteritems():
            (status, content, mountlist) = v

            for mount_id in mountlist:
                if mount_id in mount_done:
                    continue  # already moved this one

                # TODO: a failed move is only recorded here (and later skipped
                # in the catalog update); consider real error handling.
                mount_done[mount_id] = \
                    (self.SanFailback_mountpoint(mount_id, recoverable_mps[mount_id]) == 0)

        self.logger.debug('Completed mount-recovery:')
        for mount_id, moved in mount_done.iteritems():
            if moved:
                self.logger.debug('mount-id %d ---> TRUE' % mount_id)
            else:
                self.logger.debug('mount-id %d ---> FALSE' % mount_id)

        # 7 Start GPDB in admin (master-only) mode for the catalog updates.
        os.putenv('GPSTART_INTERNAL_MASTER_ONLY', '1')
        e = os.system('gpstart -m -d %s' % (master_data_dir))
        if e:
            self.logger.error('Failed to bring Greenplum Database up in management mode: segment recovery failed')
            raise Exception("Failed to start GPDB in management mode.")
        self.logger.info('Greenplum Database restarted for configuration update')

        # 8 Update the configuration, all inside a single transaction so a
        #   failure part-way leaves the catalog untouched (connection close
        #   without COMMIT discards the work).
        conn = None
        try:
            db_url = dbconn.DbURL(port=gpEnv.getMasterPort(), dbname='template1')

            conn = dbconn.connect(db_url, utility=True)

            dbconn.execSQL(conn, "BEGIN")

            self.logger.debug('Starting Transaction')

            # Update gp_san_configuration: point every successfully moved
            # mountpoint back at its primary host, with a history record.
            for mount_id, moved in mount_done.iteritems():
                self.logger.debug('Checking Mount id %d' % mount_id)
                if moved:
                    sql = 'UPDATE gp_san_configuration SET active_host=\'p\' WHERE mountid=%d' % mount_id
                    self.logger.debug('Issuing SQL [%s]' % sql)
                    dbconn.executeUpdateOrInsert(conn, sql, 1)

                    history_message = "GPRECOVERSEG: san-mount-id %d set active_host to primary" % (mount_id)
                    sql = 'INSERT INTO gp_configuration_history values (now(), -1, \'%s\')' % history_message
                    self.logger.debug('Issuing SQL [%s]' % sql)
                    dbconn.executeUpdateOrInsert(conn, sql, 1)

            # Update gp_segment_configuration: mark each segment up whose own
            # mountpoints were all moved back successfully.
            for dbid, v in recoverable_segs.iteritems():
                (status, content, mountlist) = v

                self.logger.debug('Checking dbid id %d' % dbid)

                # BUG FIX: only the mountpoints in THIS segment's mountlist
                # matter.  The previous code scanned every entry of
                # mount_done, so one failed mountpoint belonging to any
                # segment blocked the recovery of ALL segments (and its loop
                # variable clobbered the enclosing loop's 'v').
                all_mountpoints = True
                for mount_id in mountlist:
                    self.logger.debug('Scanning mountid %d in dbid id %d' % (mount_id, dbid))
                    if mount_done.get(mount_id):
                        self.logger.debug('Mountid %d --> True' % mount_id)
                    else:
                        self.logger.debug('Mountid %d --> False' % mount_id)
                        all_mountpoints = False

                if all_mountpoints:
                    sql = 'UPDATE gp_segment_configuration SET status = \'u\' where dbid = %d' % dbid

                    self.logger.debug('Issuing SQL [%s]' % sql)
                    dbconn.executeUpdateOrInsert(conn, sql, 1)

                    # Both the primary and its mirror flip back to their
                    # preferred roles, hence expected rowcount 2.
                    sql = 'UPDATE gp_segment_configuration SET role = preferred_role where content = %d' % content
                    self.logger.debug('Issuing SQL [%s]' % sql)
                    dbconn.executeUpdateOrInsert(conn, sql, 2)

                    history_message = "GPRECOVERSEG: content %d, dbid %d moved to primary host" % (content, dbid)
                    sql = 'INSERT INTO gp_configuration_history values (now(), %d, \'%s\')' % (dbid, history_message)
                    self.logger.debug('Issuing SQL [%s]' % sql)
                    dbconn.executeUpdateOrInsert(conn, sql, 1)
                else:
                    self.logger.info('Failed to recover sufficient mountpoints for dbid %d' % dbid)

            self.logger.debug('Committing our updates.')
            dbconn.execSQL(conn, "COMMIT")
        finally:
            if conn:
                conn.close()

        # 9 Leave admin mode ...
        e = os.system('gpstop -m -d %s' % (master_data_dir))
        if e:
            self.logger.error('Failed to stop Greenplum Database up in management mode: segment recovery failed')
            raise Exception("Failed to stop GPDB, from management mode.")
        self.logger.info('Greenplum Database stopped, preparing for full restart.')

        # 10 ... and bring the whole cluster back up.
        e = os.system('gpstart -aq -d %s' % (master_data_dir))
        if e:
            self.logger.error('Failed to restart Greenplum Database: segment recovery failed')
            raise Exception("Failed to restart GPDB.")
        self.logger.info('Successfully restarted the Greenplum Database')

        configInterface.getConfigurationProvider().sendPgElogFromMaster( "SAN recovery has completed.", True)

        return 0