Commit 5827d494 authored by Malik Koné's avatar Malik Koné
Browse files

actualisation cycles.ipynb

parent b8b1a88c
%% Cell type:code id: tags:
``` python
#Tested with Python 3.52
#-*-coding:utf-8-*-
%load_ext autoreload
%autoreload 2
#Works with Python 3.52
import os
os.sys.version
```
%% Output
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
'3.5.2 (default, Nov 12 2018, 13:43:14) \n[GCC 5.4.0 20160609]'
%% Cell type:code id: tags:
``` python
import graph_tool as gt
import pandas as pd
import graph_tool.draw as gtd
import matplotlib.pyplot as plt
from mlkdataprep import Replies, ExpandedReplies
import numpy as np
# obsolete imports
#import graph_tool.centrality as gtc
#import graph_tool.generation as gtg
#import graph_tool.inference as gti
#import graph_tool.topology as gtt
#from matplotlib import cm
#import func4graph as fg
```
%% Cell type:code id: tags:
``` python
# Known course names; the selected one determines the forum database file.
CRS_NAMES = ['villes-africaines', 'python-machine-learning', 'python-plotting']
COURSE_NAME = CRS_NAMES[1]
BKF_REP = './Database/%s-forum.mydb' % COURSE_NAME

# Loading and formatting the data.
erep = ExpandedReplies(fn=BKF_REP)
erdf = erep.df

# ~~~~#### REPLY GROUPS ####~~~~
# Group the messages per thread in RGS (i.e. REPLY GROUPS): the grouping key
# is the part of p_id located before the first '/'.
gkeys = erdf.p_id.apply(lambda x: x.split('/')[0])
RGS = erdf.groupby(gkeys, as_index=False)

# Also group the messages per user.
UGS = erdf.groupby("author_id")
```
%% Cell type:code id: tags:
``` python
# Preview the first two rows of the expanded replies table.
erdf.head(n=2)
```
%% Output
p_id \
0 dsPknEY1EeeUFw7lmNnFrg
1 dsPknEY1EeeUFw7lmNnFrg/replies/KNqdRkaQEeejWg7...
p_text \
0 I am presuming this course is in Python 3? I'v...
1 We are using python 3.5. The main libraries re...
author_id p_type p_title p_comment \
0 9bb0bc1d31d1b90db50a2ac701a51879 thread_starter Python 3.x nan
1 ac9ba679ef8c933da9a07dfc17bc3e7d reply nan Hide 1 Reply
p_upvote full_names roles forum_names extras uid rolesid \
0 0 Max Russell NaN Week 1 NaN 0 0
1 0 Zijian Wang Teaching Staff Week 1 NaN 1 1
tid rid cid p_typeid humanized_dates age fid
0 0 NaN NaN 0 8 months ago 20736000 5
1 0 0.0 NaN 1 8 months ago 20736000 5
%% Cell type:code id: tags:
``` python
# #### Adding the links (edges) inside a thread
def cree_liens_pour_group(g, thread, users, rank, nb_groups, parallel=True, how='star'):
    """Add links between the users who posted in the same thread.

    Every ordered pair of messages (i, j) of the thread is submitted to
    ``create_or_update_edge_in_``, which tells whether an edge has to be
    created, updated or skipped, and with which strength.  Edges go from the
    author of message j to the author of message i.  The graph is modified
    in place; nothing is returned.

    - g: the graph where the links are added
    - thread: a group of messages from the same thread; its author_id, age,
      p_type, p_upvote, p_id, fid, tid and rid columns are read
    - users: the user ids of the network; the position of an id in this
      sequence is the index of its vertex in g
    - rank: rank of this thread, only used in the progress message
    - nb_groups: the number of threads in the network, only used in the
      progress message
    - parallel: (True) whether parallel edges should be created or not
    - how: (star) how to link users; passed through to
      create_or_update_edge_in_
    """
    # number of messages != number of users in this discussion
    nb_msg = len(thread)

    # Resolve the vertex of each message's author once per thread instead of
    # once per (i, j) pair: users.index() is a search through `users`, which
    # is wasteful to repeat inside the nested loops.
    vertices = [g.vertex(users.index(author)) for author in thread.author_id]

    # Marker stored on an edge, depending on the type of the message j.
    ptype_markers = {'thread_starter': 'circle',
                     'reply_comment': 'arrow',
                     'reply': None}

    # loop over the participants of the group, in the order of the group
    for i, vi in enumerate(vertices):
        g.vp['age'][vi] = max(g.vp['age'][vi], thread.age.iloc[i])
        # thread starter only when i == 0, for this group
        g.vp['thread_starter'][vi] += 1 if i == 0 else 0

        for j, vj in enumerate(vertices):
            edge_creation, seuil = create_or_update_edge_in_(
                thread, g, vi, vj, i, j, parallel, how=how)

            # A zero strength means "no link", whatever the requested action.
            if edge_creation and seuil:
                p_type = thread.iloc[j].p_type
                if p_type == 'reply':
                    g.vp['replies'][vi] += 1
                elif p_type == 'reply_comment':
                    g.vp['comments'][vi] += 1

                if edge_creation == "create":
                    # from j to i (in the direction of the action):
                    # j gives a piece of information to i
                    e = g.add_edge(vj, vi)
                    g.ep['weight'][e] = seuil
                    g.ep['age'][e] = thread.age.iloc[j]
                    g.ep["ptype"][e] = ptype_markers[p_type]
                    g.ep['upvote'][e] = thread.p_upvote.iloc[j]
                    g.ep['postid'][e] = thread.p_id.iloc[j]
                    g.ep['fid'][e] = thread.fid.iloc[j]
                    g.ep['tid'][e] = thread.tid.iloc[j]
                    # g.ep['color'][e] = TID_AGED_COLOR_MAP[thread.tid.iloc[j]]
                    g.ep['rid'][e] = thread.rid.iloc[j]
                elif edge_creation == "update":
                    # Existing edge: keep the smallest age and add up the strengths.
                    e = g.edge(vj, vi)
                    g.ep['age'][e] = np.min([thread.age.iloc[j], g.ep['age'][e]])
                    g.ep['weight'][e] += seuil

            # One-line progress display, overwritten in place thanks to '\r'.
            print('Group: %s/%s, lien (%s, %s)/%s' % (rank, nb_groups, i, j, nb_msg), end='\r')
```
%% Cell type:code id: tags:
``` python
def create_or_update_edge_in_(thread, g, vi, vj, i, j, parallel, how):
    """Tell whether a link must be created or updated between two vertices, and return its strength.

    - thread: a set of messages from the same thread
    - g: the graph
    - vi, vj: the vertices associated with messages i and j; the link under
      consideration is the edge going from vj to vi
    - i, j: the positions of the two messages in the thread's message list
    - parallel: whether parallel edges should be created
    - how: the way links are created (see compute_strength)

    Returns a tuple (action, strength):
    - ("create", strength) when a new edge should be added,
    - ("update", strength) when parallel edges are disabled and vi is
      already an out-neighbour of vj,
    - (False, 0) when i is not before j, when both vertices are the same,
      or when either vertex is flagged as staff or mentor.
    """
    time_direction = i < j
    no_self_edge = vi != vj
    if not (time_direction and no_self_edge):
        return (False, 0)

    # No link from or to a staff member or a mentor.
    if g.vp['staff_or_mentor'][vi] or g.vp['staff_or_mentor'][vj]:
        return (False, 0)

    seuil = compute_strength(thread, i, j, how)

    # The out-neighbourhood of vj is only fetched for accepted pairs, and
    # only when parallel edges are disabled: it is not needed otherwise.
    if parallel or vi not in g.get_out_neighbours(vj):
        return ("create", seuil)

    # already a neighbour, and parallel edges are not wanted
    return ("update", seuil)
def compute_strength(thread, i, j, how):
    """Compute the strength of the link between the i-th and j-th repliers of a thread.

    This is where content evaluation would be taken into account.

    - thread: the messages of the thread (not used by the current strategies)
    - i, j: positions of the two messages in the thread; the log-based
      strategies expect i < j
    - how: the linking strategy
        precall (alias: prec): link to all preceding messages, with a
            strength of 1 / ln(j - i + 1), i.e. decreasing with the distance
        prec1: link to the immediately preceding message only (strength 1)
        prec3: link to the last three preceding messages, with a strength
            of 1 / ln(j - i + 1)
        star: link only to the thread starter (strength 1 when i == 0)
        all: link to every message with a strength of 1

    Returns the strength as a float; 0.0 means no link.
    Raises ValueError when `how` is not one of the strategies above.
    """
    distance = j - i
    if how in ("precall", "prec"):
        # "prec" is the name used in the documentation, "precall" the
        # historical keyword; both are accepted.
        strength = 1 / np.log(distance + 1)
    elif how == "prec1":
        strength = 1 if distance == 1 else 0
    elif how == "prec3":
        strength = 1 / np.log(distance + 1) if distance <= 3 else 0
    elif how == "star":
        strength = 1 if i == 0 else 0
    elif how == "all":
        strength = 1
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("Choisissez un type de connexion valide")
    return float(strength)
```
%% Cell type:code id: tags:
``` python
# ### SETTING UP THE GRAPH ####
# One vertex per known user, in the order of IDS['users'].
IDS = erep.get_ids()
g = gt.Graph()
g.add_vertex(len(IDS['users']))

# ### SETTING UP VERTICES ####
# internal vertex properties that are prefilled right below
for vprop_name, vprop_type in (("names", "string"),
                               ("staff_or_mentor", "bool"),
                               ("role", "int")):
    g.vp[vprop_name] = g.new_vertex_property(vprop_type)

# Prefill those vertex properties
for (i, ui) in enumerate(IDS['users']):
    vi = g.vertex(i)
    # label: first word of the user's name, cut to 7 characters
    full_name = erep.get_uname_from_uid(ui)
    g.vp['names'][vi] = full_name.split(' ')[0][:7]
    g.vp['staff_or_mentor'][vi] = ui in IDS['staffs_n_mentors']
    # role id taken from the first message of this user
    g.vp['role'][vi] = UGS.get_group(ui).rolesid.iloc[0]

# remaining vertex properties, left at their default values here
for vprop_name, vprop_type in (("groups", "vector<int>"),
                               ("thread_starter", "float"),
                               ("replies", "float"),
                               ("comments", "float"),
                               ("age", "float"),
                               ("color", "vector<float>")):
    g.vp[vprop_name] = g.new_vertex_property(vprop_type)

# #### SETTING UP EDGES ####
for eprop_name, eprop_type in (("upvote", "float"),
                               ("ptype", "string"),
                               ("postid", "string"),
                               ("age", "float"),
                               ("weight", "float"),
                               ("fid", "float"),  # forum id
                               ("tid", "float"),
                               ("rid", "float"),
                               ("color", "vector<float>")):
    g.ep[eprop_name] = g.new_edge_property(eprop_type)
```
%% Cell type:code id: tags:
``` python
# TODO: remember to store the edges next to their post ids... in a DataFrame, and the vertex as well
```