# src/plugins/scanners/discourse.py
import datetime
import hashlib
import re
import time

import plugins.utils.jsonapi


def scanJob(KibbleBit, source, cat, creds):
""" Scans a single discourse category for activity """
NOW = int(datetime.datetime.utcnow().timestamp())
# Get $discourseURL/c/$catID
    catURL = "%s/c/%s" % (source['sourceURL'], cat['id'])
KibbleBit.pprint("Scanning Discourse category '%s' at %s" % (cat['slug'], catURL))
    page = 0
    allUsers = {}
    # Discourse hides email addresses (obviously!), so we fabricate one per
    # user from the forum's host name to compute deterministic account IDs.
    fakeDomain = "foo.discourse"
    m = re.match(r"https?://([-a-zA-Z0-9.]+)", source['sourceURL'])
    if m:
        fakeDomain = m.group(1)
    # For each paginated result (up to page 100), check for changes
    while page < 100:
pcatURL = "%s?page=%u" % (catURL, page)
catjson = plugins.utils.jsonapi.get(pcatURL, auth = creds)
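        # plugins.utils.jsonapi.get is assumed to return the decoded JSON
        # payload on success and a falsy value on failure (handled by the
        # else branch at the bottom of this loop).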
page += 1
if catjson:
# If we hit an empty list (no more topics), just break the loop.
if not catjson['topic_list']['topics']:
break
            # First (since we have data), store the known users.
            for user in catjson['users']:
                # Fake email address; compute a deterministic person ID from
                # the organisation, source URL and fabricated email.
                email = "%s@%s" % (user['username'], fakeDomain)
                dhash = hashlib.sha224(
                    ("%s-%s-%s" % (source['organisation'], source['sourceURL'], email)).encode('ascii', errors='replace')
                ).hexdigest()
# Construct a very sparse user document
userDoc = {
'id': dhash,
'organisation': source['organisation'],
'name': user['username'],
'email': email,
}
# Store user-ID-to-username mapping for later
allUsers[user['id']] = userDoc
                # Store it (or queue it for storage) unless it already exists.
                # We don't want to override better data, so check first.
if not KibbleBit.exists('person', dhash):
KibbleBit.append('person', userDoc)
# Now, for each topic, we'll store a topic document
for topic in catjson['topic_list']['topics']:
# Calculate topic ID
dhash = hashlib.sha224( ("%s-%s-topic-%s" % (source['organisation'], source['sourceURL'], topic['id']) ).encode('ascii', errors='replace')).hexdigest()
# Figure out when topic was created and updated
CreatedDate = datetime.datetime.strptime(topic['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp()
if topic.get('last_posted_at'):
UpdatedDate = datetime.datetime.strptime(topic['last_posted_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp()
else:
UpdatedDate = 0
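                    # No replies yet. With UpdatedDate at 0, any previously
                    # stored copy of the topic compares as current and is
                    # skipped below.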
                # Decide whether to scan this topic or skip to the next one,
                # by checking whether we already hold an up-to-date copy.
if KibbleBit.exists('forum_topic', dhash):
fdoc = KibbleBit.get('forum_topic', dhash)
# If update in the old doc was >= current update timestamp, skip the topic
if fdoc['updated'] >= UpdatedDate:
continue
                # Assuming we need to scan this, start by making the base topic
                # document. The first entry in 'posters' is the topic creator.
                creator = allUsers[topic['posters'][0]['user_id']]
                topicdoc = {
                    'id': dhash,
                    'sourceID': source['sourceID'],
                    'organisation': source['organisation'],
                    'type': 'discourse',
                    'category': cat['slug'],
                    'title': topic['title'],
                    'creator': creator['id'],
                    'creatorName': creator['name'],
'created': CreatedDate,
'createdDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(CreatedDate)),
'updated': UpdatedDate,
'solved': False, # Discourse doesn't have this notion, but other forums might.
'posts': topic['posts_count'],
'views': topic['views'],
'url': source['sourceURL'] + "/t/%s/%s" % (topic['slug'], topic['id'])
}
KibbleBit.append('forum_topic', topicdoc)
KibbleBit.pprint("%s is new or changed, scanning" % topicdoc['url'])
# Now grab all the individual replies/posts
                # Remember not to have it count as a visit!
                pURL = "%s?track_visit=false&forceLoad=true" % topicdoc['url']
                pjson = plugins.utils.jsonapi.get(pURL, auth=creds)
                if not pjson:
                    KibbleBit.pprint("Fetching posts for %s failed!" % pURL)
                    continue
                posts = pjson['post_stream']['posts']
# For each post/reply, construct a forum_entry document
KibbleBit.pprint("%s has %u posts" % (pURL, len(posts)))
for post in posts:
                    phash = hashlib.sha224(
                        ("%s-%s-post-%s" % (source['organisation'], source['sourceURL'], post['id'])).encode('ascii', errors='replace')
                    ).hexdigest()
                    # Prefer the full name; fall back to the username when the
                    # name is missing or empty.
                    uname = post.get('name') or post['username']
# Find the hash of the person who posted it
# We may know them, or we may have to store them.
# If we have better info now (full name), re-store
if post['user_id'] in allUsers and allUsers[post['user_id']]['name'] == uname:
uhash = allUsers[post['user_id']]['id']
else:
# Same as before, fake email, store...
email = "%s@%s" % (post['username'], fakeDomain)
                        uhash = hashlib.sha224(
                            ("%s-%s-%s" % (source['organisation'], source['sourceURL'], email)).encode('ascii', errors='replace')
                        ).hexdigest()
# Construct a very sparse user document
userDoc = {
'id': uhash,
'organisation': source['organisation'],
'name': uname,
'email': email,
}
# Store user-ID-to-username mapping for later
                        allUsers[post['user_id']] = userDoc
# Store it (or, queue storage)
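                        # (unconditionally this time, since this doc may carry
                        # a fuller name than the sparse one stored earlier)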
KibbleBit.append('person', userDoc)
# Get post date
CreatedDate = datetime.datetime.strptime(post['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp()
# Store the post/reply document
pdoc = {
'id': phash,
'sourceID': source['sourceID'],
'organisation': source['organisation'],
'type': 'discourse',
'creator': uhash,
'created': CreatedDate,
'createdDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(CreatedDate)),
'topic': dhash,
'post_id': post['id'],
'text': post['cooked'],
'url': topicdoc['url']
}
KibbleBit.append('forum_post', pdoc)
else:
KibbleBit.pprint("Fetching discourse data failed!")
return False
return True
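
# A minimal usage sketch (hypothetical): scanJob is normally driven by the
# Kibble scanner framework, which supplies the KibbleBit handle, the source
# record and the category data. The shapes below are assumptions inferred
# from the fields this function reads, for illustration only:
#
#   source = {
#       'sourceID': 'abc123',
#       'sourceURL': 'https://discourse.example.org',
#       'organisation': 'example',
#   }
#   cat = {'id': 5, 'slug': 'general'}
#   creds = None  # or whatever plugins.utils.jsonapi.get accepts as auth
#   scanJob(kibblebit, source, cat, creds)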