def scan()

in src/plugins/scanners/pipermail.py [0:0]


def _parseSender(fromHeader):
    """Split a From header into a (display name, e-mail address) pair.

    Two layouts are recognised: the Pipermail-obfuscated form
    "user at example.org (Real Name)" and the regular "Real Name <user@host>"
    form. If neither matches, the raw header is returned as both the name
    and the address.
    """
    m = re.match(r"(.+?) at (.+?) \((.*)\)$", fromHeader, flags=re.UNICODE)
    if m:
        return m.group(3).strip(), m.group(1) + "@" + m.group(2)
    m = re.match(r"(.+)\s*<(.+)>", fromHeader, flags=re.UNICODE)
    if m:
        return m.group(1).replace('"', "").strip(), m.group(2)
    return fromHeader, fromHeader


def _messageDate(message):
    """Return (epoch seconds, "YYYY/MM/DD HH:MM:SS" in UTC) for a message.

    Returns None when the Date header is missing or cannot be converted, so
    that a single malformed message does not abort the whole monthly archive.
    """
    try:
        ts = email.utils.mktime_tz(email.utils.parsedate_tz(message['date']))
        return ts, time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(ts))
    except (TypeError, ValueError, OverflowError, OSError, AttributeError):
        return None


def _scanMonth(KibbleBit, source, mailFile, pd, dhash, knowns):
    """Parse one monthly mbox file and push its documents through KibbleBit.

    Appends one 'email' document per usable message, one 'person' document
    per sender that KibbleBit does not know yet, indexes the ten busiest
    subjects as 'mailtop' and the monthly totals as 'mailstats' (under dhash).

    pd is the time tuple of the first day of the month being scanned.
    knowns is a cache of senders already checked/registered; it is shared
    across months and updated in place.
    """
    messages = mailbox.mbox(mailFile)
    try:
        rawtopics = {}    # normalised subject -> number of emails
        posters = set()   # distinct sender addresses seen this month
        senders = {}      # message-id -> sender address, for reply lookup
        emails = 0
        for message in messages:
            emails += 1
            # message[...] yields None for absent headers, so this covers
            # both missing and empty Subject/From.
            if not message['subject'] or not message['from']:
                continue
            dateinfo = _messageDate(message)
            if dateinfo is None:
                # Unusable Date header: skip this message only.
                continue
            mts, mdatestring = dateinfo

            irt = message.get('in-reply-to', None)
            if not irt and message.get('references'):
                irt = message.get('references').split("\n")[0].strip()
            replyto = None
            if irt and irt in senders:
                replyto = senders[irt]
                print("This is a reply to %s" % replyto)

            # Strip a leading "Re:"/"Fwd:"-style prefix and embedded line
            # breaks so that replies are counted under the same topic.
            raw_subject = re.sub(r"^[a-zA-Z]+\s*:\s*", "", message['subject'], count=10)
            raw_subject = re.sub(r"[\r\n\t]+", "", raw_subject, count=10)
            rawtopics[raw_subject] = rawtopics.get(raw_subject, 0) + 1

            name, sender = _parseSender(message['from'])
            posters.add(sender)
            senders[message.get('message-id', "??")] = sender

            if sender not in knowns:
                sid = hashlib.sha1(("%s%s" % (source['organisation'], sender)).encode('ascii', errors='replace')).hexdigest()
                # Only create the person document when it does not exist yet.
                if not KibbleBit.exists('person', sid):
                    KibbleBit.append('person', {
                        'name': name,
                        'email': sender,
                        'organisation': source['organisation'],
                        'id': sid
                    })
                knowns[sender] = True

            KibbleBit.append('email', {
                'organisation': source['organisation'],
                'sourceURL': source['sourceURL'],
                'sourceID': source['sourceID'],
                'date': mdatestring,
                'sender': sender,
                'replyto': replyto,
                'subject': message['subject'],
                'address': sender,
                'ts': mts,
                'id': message['message-id']
            })

        md = time.strftime("%Y/%m/%d %H:%M:%S", pd)
        # Top 10 topics, ranked by number of emails (busiest first).
        top10 = sorted(rawtopics, key=lambda subject: rawtopics[subject], reverse=True)[:10]
        for key in top10:
            val = rawtopics[key]
            KibbleBit.pprint("Found top 10: %s (%s emails)" % (key, val))
            shash = hashlib.sha224(key.encode('ascii', errors='replace')).hexdigest()
            # one unique id per month per mail thread
            mlhash = hashlib.sha224(("%s%s%s%s" % (key, source['sourceURL'], source['organisation'], md)).encode('ascii', errors='replace')).hexdigest()
            KibbleBit.index('mailtop', mlhash, {
                'organisation': source['organisation'],
                'sourceURL': source['sourceURL'],
                'sourceID': source['sourceID'],
                'date': md,
                'emails': val,
                'shash': shash,
                'subject': key,
                'ts': time.mktime(pd),
                'id': mlhash
            })

        KibbleBit.index('mailstats', dhash, {
            'organisation': source['organisation'],
            'sourceURL': source['sourceURL'],
            'sourceID': source['sourceID'],
            'date': md,
            'authors': len(posters),
            'emails': emails,
            'topics': len(rawtopics)
        })
    finally:
        # Release the file handle before the caller deletes the file.
        messages.close()


def scan(KibbleBit, source):
    """Scan a Pipermail archive month by month and record mail statistics.

    Starting at the current month and walking backwards, downloads
    "<sourceURL>/<YYYY>-<MonthName>.txt.gz", parses it as an mbox and stores
    email, person, top-topic and monthly-stats documents through KibbleBit.
    The two most recent months are always re-parsed; older months are only
    parsed if no 'mailstats' document exists for them yet. Scanning stops
    after more than 12 consecutive months that are missing or unparseable,
    or once the year drops below 1970.

    KibbleBit: object providing pprint, updateSource, exists, append, index.
    source: dict with at least 'sourceURL', 'sourceID', 'organisation' and
            'steps'; source['steps']['mail'] is updated with progress/status.
    """
    url = source['sourceURL']
    pipermail = re.match(r"(https?://.+/(archives|pipermail)/.+?)/?$", url)
    if not pipermail:
        KibbleBit.pprint("Invalid Pipermail URL detected: %s" % url, True)
        source['steps']['mail'] = {
            'time': time.time(),
            'status': 'Invalid or malformed URL detected!',
            'running': False,
            'good': False
        }
        KibbleBit.updateSource(source)
        return

    KibbleBit.pprint("Scanning Pipermail source %s" % url)
    source['steps']['mail'] = {
        'time': time.time(),
        'status': 'Downloading Pipermail statistics',
        'running': True,
        'good': True
    }
    KibbleBit.updateSource(source)

    dt = time.gmtime(time.time())
    firstYear = 1970
    year = dt[0]
    month = dt[1]
    months = 0    # number of monthly archives we have attempted so far
    skipped = 0   # consecutive months that were missing or failed to parse
    knowns = {}   # senders already checked against / added to 'person'

    monthNames = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

    # While we have older archives, continue to parse
    while firstYear <= year:
        gzurl = "%s/%04u-%s.txt.gz" % (url, year, monthNames[month - 1])
        pd = datetime.date(year, month, 1).timetuple()
        dhash = hashlib.sha224(("%s %s" % (source['organisation'], gzurl)).encode('ascii', errors='replace')).hexdigest()
        found = KibbleBit.exists('mailstats', dhash)
        if months <= 1 or not found:  # Always parse this month's stats and the previous month :)
            months += 1
            mailFile = plugins.utils.urlmisc.unzip(gzurl)
            if mailFile:
                try:
                    _scanMonth(KibbleBit, source, mailFile, pd, dhash, knowns)
                    # Only a successfully parsed month ends a run of skips.
                    skipped = 0
                except Exception as err:
                    KibbleBit.pprint("Couldn't parse %s, skipping: %s" % (gzurl, err))
                    skipped += 1
                finally:
                    # Always remove the downloaded temp file, even on failure.
                    try:
                        os.unlink(mailFile)
                    except OSError as err:
                        KibbleBit.pprint("Couldn't remove temporary file %s: %s" % (mailFile, err))
            else:
                KibbleBit.pprint("Couldn't find %s, skipping." % gzurl)
                skipped += 1
            if skipped > 12:
                KibbleBit.pprint("12 skips in a row, breaking off (no more data?)")
                break
        month -= 1
        if month <= 0:
            month += 12
            year -= 1

    source['steps']['mail'] = {
        'time': time.time(),
        'status': 'Mail archives successfully scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())),
        'running': False,
        'good': True
    }
    KibbleBit.updateSource(source)