in src/plugins/scanners/ponymail.py [0:0]
def _previous_month(year, month):
    """Return the ``(year, month)`` pair of the calendar month before the given one."""
    if month <= 1:
        return year - 1, 12
    return year, month - 1


def _set_mail_step(KibbleBit, source, status, running, good):
    """Store the state of the 'mail' step on ``source`` and persist it via KibbleBit."""
    source['steps']['mail'] = {
        'time': time.time(),
        'status': status,
        'running': running,
        'good': good
    }
    KibbleBit.updateSource(source)


def _index_top_threads(KibbleBit, source, js, emails_by_id, month_start):
    """Index the (at most) ten largest threads of one month as 'mailtop' documents.

    ``js`` is the decoded stats response, ``emails_by_id`` maps email id to
    email dict, and ``month_start`` is the time tuple of the first day of the
    month being processed.
    """
    threads = []
    for thread in js['thread_struct']:
        count = countSubs(thread, 0)
        # The thread id is the id of the email that started it; use that
        # email's subject as the thread subject.
        root = emails_by_id.get(thread['tid'])
        subject = root['subject'] if root is not None else ""
        if len(subject) > 0 and count > 0:
            # Strip reply/forward prefixes and line breaks so the same topic
            # hashes to the same id regardless of how it was replied to.
            subject = re.sub(r"^((re|fwd|aw|fw):\s*)+", "", subject, flags=re.IGNORECASE)
            subject = re.sub(r"[\r\n\t]+", "", subject, count=20)
            shash = hashlib.sha1(subject.encode('ascii', errors='replace')).hexdigest()
            threads.append((shash, subject, count))

    md = time.strftime("%Y/%m/%d %H:%M:%S", month_start)
    # Ascending stable sort, then reversed: largest threads first, and equally
    # sized threads keep the same relative order as before this refactoring.
    ranked = sorted(threads, key=lambda entry: entry[2])[::-1]
    for shash, subject, count in ranked[:10]:
        KibbleBit.pprint("Found top 10: %s (%s emails)" % (subject, count))
        # One unique id per month per mail thread
        mlhash = hashlib.sha224(("%s%s%s%s" % (shash, source['sourceURL'], source['organisation'], md)).encode('ascii', errors='replace')).hexdigest()
        KibbleBit.index('mailtop', mlhash, {
            'organisation': source['organisation'],
            'sourceURL': source['sourceURL'],
            'sourceID': source['sourceID'],
            'date': md,
            'emails': count,
            'shash': shash,
            'subject': subject,
            # NOTE(review): time.mktime() reads the tuple as local time while
            # the per-email 'ts' values come straight from the archive; left
            # unchanged so previously indexed documents stay comparable.
            'ts': time.mktime(month_start),
            'id': mlhash
        })


def _index_emails(KibbleBit, source, js, emails_by_id, replyList, knowns):
    """Queue 'person' and 'email' documents for every email of one month.

    ``replyList`` maps an email id to the id of the email it replies to.
    ``knowns`` is the cross-month cache of sender addresses already recorded
    as persons; it is updated in place.

    Returns the number of distinct sender addresses seen in this month.
    """
    organisation = source['organisation']
    posters = set()
    for email in js['emails']:
        sender = email['from']
        name = sender
        # Split '"Some Name" <addr@example.org>' into display name and address.
        m = re.match(r"(.+)\s*<(.+)>", email['from'], flags=re.UNICODE)
        if m:
            name = m.group(1).replace('"', "").strip()
            sender = m.group(2)
        posters.add(sender)

        person_id = hashlib.sha1(("%s%s" % (organisation, sender)).encode('ascii', errors='replace')).hexdigest()
        if sender not in knowns and KibbleBit.exists('person', person_id):
            knowns[sender] = True
        # Unknown senders are always recorded; known ones are re-sent whenever
        # the header carries a display name distinct from the bare address.
        if sender not in knowns or name != sender:
            KibbleBit.append('person', {
                'upsert': True,
                'name': name,
                'email': sender,
                'organisation': organisation,
                'id': person_id
            })
            knowns[sender] = True

        replyTo = None
        if email['id'] in replyList:
            parent = emails_by_id.get(replyList[email['id']])
            if parent is not None:
                replyTo = getSender(parent)
                KibbleBit.pprint("Email was reply to %s" % replyTo)

        KibbleBit.append('email', {
            'organisation': organisation,
            'sourceURL': source['sourceURL'],
            'sourceID': source['sourceID'],
            'date': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(email['epoch'])),
            'sender': sender,
            'address': sender,
            'subject': email['subject'],
            'replyto': replyTo,
            'ts': email['epoch'],
            'id': email['id'],
            'upsert': True
        })
    return len(posters)


def scan(KibbleBit, source):
    """Scan a Pony Mail list archive, newest month first, and index its statistics.

    The source URL must look like ``http(s)://host/list.html?list@domain`` and
    ``source['creds']['cookie']`` must hold a cookie for the Pony Mail
    instance. Progress and the final outcome are written to
    ``source['steps']['mail']`` and persisted with ``KibbleBit.updateSource``.

    For each month, from the current one back to the archive's ``firstYear``,
    the ``api/stats.lua`` endpoint is queried and three kinds of documents are
    produced: 'mailtop' (largest threads), 'person'/'email' (one per sender /
    message) and 'mailstats' (monthly totals). Months that already have a
    'mailstats' document are skipped, except that the first two months visited
    are always re-parsed.

    Parameters:
        KibbleBit: scanner handle providing ``pprint``, ``updateSource``,
            ``exists``, ``index`` and ``append``.
        source: dict describing the source; reads ``sourceURL``,
            ``organisation``, ``sourceID`` and ``creds``, writes ``steps``.

    Returns:
        None.
    """
    # Validate URL first
    url = re.match(r"(https?://.+)/list\.html\?(.+)@(.+)", source['sourceURL'])
    if not url:
        KibbleBit.pprint("Malformed or invalid Pony Mail URL passed to scanner: %s" % source['sourceURL'])
        _set_mail_step(KibbleBit, source, 'Could not parse Pony Mail URL!', False, False)
        return

    # Pony Mail requires a UI cookie in order to work. Make sure we have one!
    cookie = None
    if 'creds' in source and source['creds']:
        cookie = source['creds'].get('cookie', None)
    if not cookie:
        KibbleBit.pprint("Pony Mail instance at %s requires an authorized cookie, none found! Bailing." % source['sourceURL'])
        _set_mail_step(KibbleBit, source, 'No authorized cookie found in source object.', False, False)
        return

    # Notify scanner and DB that this is valid and we've begun parsing
    KibbleBit.pprint("%s is a valid Pony Mail address, parsing" % source['sourceURL'])
    _set_mail_step(KibbleBit, source, 'Downloading Pony Mail statistics', True, True)

    # Get base URL, list and domain to parse
    base_url, list_name, domain = url.groups()

    # Start at the current (UTC) month and walk backwards
    now = time.gmtime(time.time())
    year, month = now[0], now[1]
    firstYear = 1970  # lower bound until the server reports the real one
    months = 0        # number of months for which a fetch was attempted

    # Hash for keeping records of who we know
    knowns = {}

    required_fields = ('firstYear', 'emails', 'thread_struct', 'no_threads')

    # While we have older archives, continue to parse
    while firstYear <= year:
        statsurl = "%s/api/stats.lua?list=%s&domain=%s&d=%s" % (base_url, list_name, domain, "%04u-%02u" % (year, month))
        dhash = hashlib.sha224(("%s %s" % (source['organisation'], statsurl)).encode('ascii', errors='replace')).hexdigest()
        already_indexed = KibbleBit.exists('mailstats', dhash)

        if months <= 1 or not already_indexed:  # Always parse this month's stats :)
            months += 1
            KibbleBit.pprint("Parsing %04u-%02u" % (year, month))
            KibbleBit.pprint(statsurl)
            month_start = datetime.date(year, month, 1).timetuple()
            try:
                js = plugins.utils.jsonapi.get(statsurl, cookie = cookie)
            except Exception as err:
                # Best effort: a failing month must not stop the whole scan.
                KibbleBit.pprint("Server error, skipping this month (%s)" % err)
                year, month = _previous_month(year, month)
                continue

            # Every field read below must be present, otherwise stop scanning.
            if any(field not in js for field in required_fields):
                KibbleBit.pprint("JSON was missing fields, aborting!")
                break
            firstYear = js['firstYear']

            replyList = repliedTo(js['emails'], js['thread_struct'])
            # Index emails by id once so thread-subject and reply lookups are
            # O(1) instead of a scan over the whole month per lookup. The
            # first occurrence of an id wins.
            emails_by_id = {}
            for email in js['emails']:
                emails_by_id.setdefault(email['id'], email)

            _index_top_threads(KibbleBit, source, js, emails_by_id, month_start)
            no_posters = _index_emails(KibbleBit, source, js, emails_by_id, replyList, knowns)

            KibbleBit.index('mailstats', dhash, {
                'organisation': source['organisation'],
                'sourceURL': source['sourceURL'],
                'sourceID': source['sourceID'],
                'date': time.strftime("%Y/%m/%d %H:%M:%S", month_start),
                'authors': no_posters,
                'emails': len(js['emails']),
                'topics': js['no_threads']
            })

        year, month = _previous_month(year, month)

    _set_mail_step(
        KibbleBit,
        source,
        'Mail archives successfully scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())),
        False,
        True
    )