Commit b1f63e8b authored by Malik Koné

add code and update the readme file

parent 8a216c41
* For the Hangouts dashboard
* For the cycles (SNA)
- install graph_tool
This folder contains the code, the input data and the output files related to generating a sociogram that represents user interactions in a forum of a Coursera MOOC.
Prerequisites:
For cycle.ipynb
- python 2.7
- graph_tool
For the web scraping notebook (scraping_to_graphe.ipynb)
- python 3.6.4
- selenium 3.141.0
- geckodriver 0.24.0 (2019-01-28)
- Mozilla Firefox 66.0.5
1. Download the Coursera data into the Data folder
2. Use cycle.ipynb to create the sociograms
* Debug
- for scraping_to_graphe.ipynb, check that you have recent geckodriver and Firefox versions
- for cycle.ipynb, the difficulty is the graph_tool installation. I had to compile it from source and it was hard to get it working within a virtualenv; that is why it uses Python 2.7. A quick environment check is sketched below.
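A minimal environment check, assuming the prerequisites above (this sketch is not part of the repository); run it once in the Python 2.7 environment used by cycle.ipynb and once in the Python 3 environment used by the scraping notebook:

from __future__ import print_function
import sys

print("Python", sys.version.split()[0])

# graph_tool is needed by cycle.ipynb, selenium by scraping_to_graphe.ipynb
for module_name in ("graph_tool", "selenium"):
    try:
        module = __import__(module_name)
        print(module_name, getattr(module, "__version__", "version unknown"))
    except ImportError:
        print(module_name, "is not installed in this environment")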
#-*-coding:utf-8-*-
import pandas as pd
import re
import datetime as dt


class Replies():

    def __init__(self, fn=None, rdf=None):
        """Load a file where tuples are separated by ;ET; and elements by ;EE;.
        Builds a pd.DataFrame."""
        if fn is not None:
            with open(fn, 'r') as f:
                rep = f.read()
            tuples = rep.split(';ET;\n')
            # removing duplicates
            s = pd.Series(pd.Series(tuples).unique())
            rdf = s.str.split(';EE;', expand=True)
            # removing the last, empty column
            rdf = rdf.drop(8, axis=1)
            rdf.columns = ['p_id', 'p_meta', 'p_text', 'author_id', 'p_type',
                           'p_title', 'p_comment', 'p_upvote']
            # if a line has no post_id, remove it
            if any(rdf.p_id == ''):
                rdf = rdf.drop(rdf.loc[rdf.p_id == ''].index)
            self.df = rdf.copy()
        elif rdf is None:
            raise Exception("At least one of fn or rdf must not be None")
        else:
            # fn is None but rdf is not
            self.df = rdf

    def __str__(self):
        return "%s" % self.df

    def expand_ids(self):
        eids = self.df.p_id.str.split('/', expand=True).drop([1, 3], axis=1)
        eids.columns = ['threads_ids', 'replies_ids', 'comments_ids', ]
        return eids

    def get_Fids(self, type_id):
        loc = 0
        if type_id == 'thread':
            loc = 'threads_ids'
        elif type_id == 'reply':
            loc = 'replies_ids'
        elif type_id == 'comment':
            loc = 'comments_ids'
        else:
            raise Exception(">>%s<< No such type_id." % type_id)
        s = self.expand_ids().loc[:, loc].dropna().unique()
        return pd.Series(s)

    def get_Fusers(self):
        return (self.df.author_id.unique())

    def get_nb_Fcomments(self):
        return len(self.get_Fids('comment'))

    def get_nb_Frc(self):
        return len(self.get_Fids('reply'))

    def get_nb_Freplies(self):
        return len(self.df) - self.get_nb_Fthreads()

    def get_nb_Fthreads(self):
        return len(self.get_Fids('thread'))

    def get_nb_Fusers(self):
        return len(self.get_Fusers())

    def get_posts_with_uid(self, uid):
        return self.df.loc[self.df.author_id == uid]

    def get_posts_with_uname(self, uname="Malik Koné"):
        return self.df[self.get_all_unames().str.contains(uname)]

    def get_all_unames(self):
        unames = self.expand_metas().full_names.str.strip()
        return unames

    def expand_metas(self):
        # put this back into the main dataframe, or use it only as an index
        pat = (r' *(?P<full_names>.*?)'
               '(?P<roles>(?:(?:Teaching )?Staff|Instructor))?'
               '(?P<forum_names>(?:Week|Assign|General)[^·]*?)? · '
               '(?P<humanized_dates>[^·]*)'
               '(?P<extras>·? Edited.*)?$'
               )
        return self.df.p_meta.str.extract(pat, expand=True)


class ExpandedReplies():

    def __init__(self, fn=None, rdf=None):
        self.rep = Replies(fn=fn, rdf=rdf)
        self.df = self.rep.df.join(self.rep.expand_metas())
        self.df = self.df.drop('p_meta', axis=1)
        self.df.p_upvote = self.df.p_upvote.apply(coerce_upvotes)
        self.df.loc[:, 'age'] = self.df.humanized_dates.apply(dehumanize_dates)
        users = self.get_ids()['users']
        self.df.loc[:, 'uid'] = self.df.author_id.apply(lambda x: users.index(x))
        rolesIds = list(self.df.roles.unique())
        self.df.loc[:, "rolesid"] = self.df.roles.apply(lambda x: rolesIds.index(x))
        threadsIds = self.get_ids()['threads']
        self.df.loc[:, 'tid'] = self.df.p_id.apply(lambda x: indexeur(x, threadsIds))
        repliesIds = self.get_ids()['replies']
        self.df.loc[:, 'rid'] = self.df.p_id.apply(lambda x: indexeur(x, repliesIds))
        commentsIds = self.get_ids()['comments']
        self.df.loc[:, 'cid'] = self.df.p_id.apply(lambda x: indexeur(x, commentsIds))
        pTypesIds = list(self.df.p_type.unique())
        self.df.loc[:, "p_typeid"] = self.df.p_type.apply(lambda x: pTypesIds.index(x))
        # fix a duplicate problem in the scraped data (same entries, different dates)
        good_dates = (self.df.loc[:, ["p_id", "age", "humanized_dates"]]
                      .groupby("p_id").aggregate(max)
                      .loc[:, ["humanized_dates", "age"]])
        self.df = (self.df.drop(["humanized_dates", "age"], axis=1)
                   .drop_duplicates().dropna(axis=0, how="all")
                   .join(good_dates, on="p_id", how="left"))
        # forward-fill the NaNs in the forum names
        self.df.forum_names = self.df.forum_names.ffill()
        self.df.loc[:, "fid"] = self.df.groupby("forum_names").ngroup()

    def __str__(self):
        return "%s" % self.df

    def get_staff_infos(self):
        """Return the staff names, uid and roles."""
        idx = self.df.roles.str.contains("Staff").dropna().index
        staff_infos = self.df.loc[idx, ['author_id', 'full_names', 'roles']].drop_duplicates()
        return staff_infos

    def statistiques_generales(self):
        m = self.df.p_type == 'thread_starter'
        nb_threads = len(self.df.loc[m])
        nb_posts = len(self.df)
        nb_users = len(self.df.author_id.drop_duplicates())
        return nb_threads, nb_posts, nb_users

    def get_ids(self):
        """Return a dictionary of lists of identifiers for:
        - all the users,
        - the staff
        - the mentors
        - the non-staff
        - the threads
        - the replies
        - the comments
        """
        users = self.df.author_id.unique()
        staffs = self.get_staff_infos().author_id.values
        mask = self.df.full_names.str.contains('Mentor').values
        mentors = self.df.iloc[mask, :].author_id.unique()
        threads = self.get_pid_where("thread_starter")
        replies = self.get_pid_where("reply")
        comments = self.get_pid_where("reply_comment")
        IDS = {
            "users": list(users),
            "staffs": list(staffs),
            "mentors": list(mentors),
            "staffs_n_mentors": list(staffs) + list(mentors),
            "nonStaffs": list(set(users) - set(staffs) - set(mentors)),
            "threads": list(threads),
            "replies": list(replies),
            "comments": list(comments),
        }
        return IDS

    def get_pid_where(self, p_type):
        return self.df.p_id.where(self.df.p_type == p_type).dropna(how="all").unique()

    def get_uname_from_uid(self, uid):
        mask = self.df.author_id == uid
        uname = self.df.full_names.loc[mask].iloc[0]
        return uname

    def get_user_groups(self):
        return self.df.sort_values('age').groupby("author_id")

    def get_thread_groups(self):
        gkeys = self.df.p_id.apply(lambda x: x.split('/')[0])
        return self.df.groupby(gkeys)

    def get_tid_groups(self):
        return self.df.sort_values('age').groupby("tid")

    def get_uid_groups(self):
        return self.df.sort_values('age').groupby("uid")


def coerce_upvotes(x):
    """Given a string x, just extract the integer it contains (0 if none)."""
    digit_rpat = re.compile(r'\d+')
    trouvaille = digit_rpat.findall(x)
    return int(trouvaille[0]) if trouvaille != [] else 0


def dehumanize_dates(x):
    """Given a string x holding a humanized date (e.g. '12 days ago ',
    '20 days ago ', 'a month ago'), de-humanize it and return the age in seconds."""
    x = x.strip()
    val, unit = x.split(' ')[:2]
    val = 1 if 'a' in val else int(val)
    if 'minute' in unit:
        val = dt.timedelta(minutes=int(val))
    if 'hour' in unit:
        val = dt.timedelta(hours=int(val))
    if 'day' in unit:
        val = dt.timedelta(days=int(val))
    if 'month' in unit:
        val = dt.timedelta(days=int(val) * 30)
    if 'year' in unit:
        val = dt.timedelta(days=int(val) * 365)
    return int(val.total_seconds())


def indexeur(x, aList):
    """Return the index of the first element of aList contained in x, else None."""
    for (k, refval) in enumerate(aList):
        if refval in x:
            return k
    return None
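

# Hedged usage sketch (not part of the original module): build the expanded
# dataframe from a scraped replies dump and print a few forum statistics.
# 'Data/replies.txt' is a hypothetical path; point it to your own dump file.
if __name__ == "__main__":
    rep = ExpandedReplies(fn='Data/replies.txt')
    nb_threads, nb_posts, nb_users = rep.statistiques_generales()
    print("threads: %d, posts: %d, users: %d" % (nb_threads, nb_posts, nb_users))
    print("staff members:")
    print(rep.get_staff_infos())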
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import json\n",
"import os\n",
"import time\n",
"from selenium import webdriver\n",
"from selenium.webdriver.firefox.options import Options\n",
"from selenium.webdriver.support.ui import WebDriverWait as wdw\n",
"from selenium.webdriver.common.by import By\n",
"\n",
"class VideoScrapper():\n",
" \"\"\"A class to scrap data from Coursera's forums\"\"\"\n",
" \n",
" def __init__(self, time, headless=False): \n",
" \"\"\"COURSE_NAME='python-plotting' #'python-machine-learning' ' #villes-africaines-1\"\"\"\n",
" self.BURL= \"https://www.coursera.org/learn/python-machine-learning\"\n",
"\n",
" print(\"Initialisation Driver Firefox\")\n",
" options = Options()\n",
" if headless:\n",
" options.add_argument('-headless')\n",
"\n",
" self.d = webdriver.Firefox(firefox_options=options)\n",
" self.d.implicitly_wait(time)\n",
"\n",
" print(\"Initiatialization done with Implicite waiting time set to %d\"%time)\n",
"\n",
" \n",
" def find_elt_by(self, func, criteria):\n",
" if func == 'LINK':\n",
" elt = self.d.find_element_by_link_text(criteria)\n",
" elif func == 'CSS':\n",
" elt = self.d.find_element_by_css_selector(criteria)\n",
" elif func == 'ID':\n",
" elt = self.d.find_element_by_id(criteria)\n",
" elif func == 'TAG':\n",
" elt = self.d.find_element_by_tag_name(criteria)\n",
" return elt\n",
" \n",
"\n",
"\n",
" #### Getting data from ins the threads ####\n",
"\n",
" def login(self):\n",
" \"\"\"log to coursera.\"\"\"\n",
" \n",
" MTP=\"yourmtp\"\n",
" EMAIL=\"yourmail@foo.com\"\n",
"\n",
" print(\"Logging in\")\n",
"\n",
" self.d.get(\"https://www.coursera.org/?authMode=login\")\n",
"\n",
" # Logging in \n",
" self.find_elt_by('ID', \"passwordInput-input\").clear()\n",
" self.find_elt_by('ID', \"passwordInput-input\").send_keys(MTP)\n",
" self.find_elt_by('ID', \"emailInput-input\").clear()\n",
" self.find_elt_by('ID', \"emailInput-input\").send_keys(EMAIL)\n",
" self.find_elt_by('CSS', \"button.Button_1fxeab1-o_O-primary_cv02ee-o_O-md_28awn8.w-100\").click()\n",
" \n",
" print(\"Done\")\n",
" return self.d\n",
"\n",
" def go_to(self, course=\"python-machine-learning\", url_tail=\"week/2\"):\n",
" self.d.get(\"https://www.coursera.org/learn/{}/home/{}\".format(course, url_tail) )\n",
"\n",
" def get_videos_url(self):\n",
" \"\"\"once on the week page, get the urls to download the videos lecture from\"\"\"\n",
" list_urls = \"div.od-lesson-collection-element:nth-child(1) > div:nth-child(1) > ul:nth-child(2) > li > a\"\n",
" elts = self.d.find_elements_by_css_selector(list_urls)\n",
"\n",
" # get url from the list\n",
" urls = [e.get_attribute('href') for e in elts]\n",
"\n",
" # filter only those that are lectures\n",
" self.lectures = [url for url in urls if \"lecture\" in url]\n",
"\n",
" dl_link = \"li.rc-LectureDownloadItem > a\"\n",
"\n",
" self.dwl = list()\n",
" for lec in self.lectures:\n",
" self.d.get(lec)\n",
" elt = self.d.find_element_by_css_selector(dl_link)\n",
" self.dwl.append(elt.get_attribute('href'))\n",
"\n",
" def dl_videos(self, dwl=None, lectures = None, base_fn='videos'):\n",
" \"\"\"Download the videos lectures once their urls was retreived\"\"\"\n",
" import requests\n",
"\n",
" if not dwl:\n",
" dwl = self.dwl\n",
"\n",
" if not lectures:\n",
" lectures = self.lectures\n",
"\n",
" base_path = \"/home/mlk/Vidéos/Coursera/DataScience_MachineLearning/\"\n",
" fns = [ \"%s_%s.mp4\"%(base_fn, fn.split('/')[-1]) for fn in lectures]\n",
"\n",
" print(\"Preparing to download %d files\"%(len(fns)))\n",
" for i in range(len(fns)):\n",
"\n",
" print(\"Downloading file : %s -->\"%fns[i], end=\" \")\n",
" r = requests.get(dwl[i], stream=True)\n",
" rlen = int(r.headers['Content-length'])\n",
" with open(base_path+fns[i], 'xb') as fd:\n",
" j = 0\n",
" for chunk in r.iter_content(chunk_size=128):\n",
" fd.write(chunk)\n",
" j +=1\n",
" dpercent = round(j*len(chunk)/rlen * 100)\n",
" print(\"Got {}/{} ({}%)\".format(j*len(chunk), rlen, dpercent), end='\\r')\n",
"\n",
" print(\"\\nDone\")\n",
" \n",
" print(\"Finished dowloading the %d file out of %d\"%(i+1, len(fns)))\n",
"\n",
"\n",
" def close(self):\n",
" d.close()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ok running as main\n",
"Initialisation Driver Firefox\n",
"Initiatialization done with Implicite waiting time set to 60\n",
"Logging in\n",
"Done\n",
"Preparing to download 6 files\n",
"Got 2422512/8613284 (28%)\n",
"Done\n",
"Finished dowloading the 1 file out of 6\n",
"Got 3066426/11544098 (27%)\n",
"Done\n",
"Finished dowloading the 2 file out of 6\n",
"Got 4774570/5314291 (90%)\n",
"Done\n",
"Finished dowloading the 3 file out of 6\n",
"Got 16996224/16996224 (100%)\n",
"Done\n",
"Finished dowloading the 4 file out of 6\n",
"Got 165429/7058179 (2%)\n",
"Done\n",
"Finished dowloading the 5 file out of 6\n",
"Got 3260400/13910942 (23%)\n",
"Done\n",
"Finished dowloading the 6 file out of 6\n"
]
}
],
"source": [
"#### if run as script ####\n",
"\n",
"if __name__ == \"__main__\":\n",
" print(\"ok running as main\")\n",
" vs = VideoScrapper(60)\n",
" d = vs.login()\n",
" vs.go_to(url_tail=\"week/4\")\n",
" vs.get_videos_url()\n",
" vs.go_to(url_tail=\"\")\n",
" vs.dl_videos(base_fn=\"week4_video\")\n",
" vs.close()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
},
"name": "videoscrapping_classes2.ipynb"
},
"nbformat": 4,
"nbformat_minor": 2
}
/* Make the chart container fill the page using CSS. */
#main-chart {
position: fixed;
left: 0px;
right: 0px;
top: 2em;
bottom: 0px;
}
.axis text {
font: 10px sans-serif;
}
.axis line {
fill: none;
stroke: #000;
shape-rendering: crispEdges;
}
#circles{
clip-path: url(#clip);
}
#-*-coding:utf-8-*-
import json
import pandas as pd
import logging

# check These/Models/models_UML.org


def len_msgsll(df):
    """Count the total length of the messages."""
    return df.loc[:, 'len_text'].sum()


def duree_conv(s):
    """Compute the duration of a conversation."""
    older, newer = s.sort_values().iloc[[0, -1]]
    return (newer - older)


def simplify_names(s):
    """Concatenate the names of the series s into a single string, without repeats."""
    rep = ''
    for x in s:
        if x not in rep:
            rep += x
    return rep


def concat_text(sgts, len_text=False):
    """Given the segments sgts, return the text in one piece, including links.
    Set len_text to True to also get the length of the text; a tuple
    (content, length) is then returned."""
    # TODO: handle attachments
    rep = ''
    longueur = 0
    for sgt in sgts:
        if sgt['type'] == 'TEXT':
            rep += sgt['text']
            longueur += len(sgt['text'])
        elif sgt['type'] == 'LINE_BREAK':
            rep += '\n'
        elif sgt['type'] == 'LINK':
            rep += ('(see link: %s)' % sgt['text'])
    if len_text:
        return (rep, longueur)
    return (rep,)


def load_conversations(fname='hangouts.json'):
    """Load the conversations from the Hangouts json export.
    Returns a list of the conversations."""
    fname = 'hangouts.json' if fname is None else fname
    with open(fname, 'r') as f:
        raw_cvs = json.load(f)['conversations']
    list_cvs = [conv['conversation']['conversation'] for conv in raw_cvs]
    # note: should return raw_cvs if list_cvs is None
    return list_cvs


def get_discussions(fname='hangouts.json'):
    """Load the conversations from a backup file."""
    with open(fname, 'r') as f:
        dsc = json.load(f)['conversations']
    return dsc


def load_conv_events(fname='hangouts.json'):
    """Load the events containing the messages from the json
    into a pd.Series (one list of events per conversation)."""
    fname = 'hangouts.json' if fname is None else fname
    with open(fname, 'r') as f:
        conversations = json.load(f)['conversations']
    L = list()
    for i in range(len(conversations)):
        L.append(conversations[i]['events'])
    conv_events = pd.Series(data=L, index=range(len(conversations)))
    return conv_events


def nb_type_par_conv(conversations, msg_type='REGULAR_CHAT_MESSAGE', seuil=4):
    # the number of events of type msg_type per conversation, kept when above seuil
    M = list()
    for i in range(len(conversations)):
        evts = conversations[i]
        k = 0
        for y in range(len(evts)):
            e = evts[y]
            if e['event_type'] == msg_type:
                k += 1
        if k > seuil:
            M.append(
                (i, k)
            )
    M.sort(key=lambda x: x[1], reverse=True)
    return M


def get_types_actions(conv_events, nb=True):
    S = set()
    for i in range(len(conv_events)):
        for e in conv_events[i]:
            if nb is True:
                S.add((e['event_type'], i))
            else:
                S.add(e['event_type'])
    L = list(S)
    L.sort()
    return L
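

# Hedged usage sketch (not part of the original module): load the Hangouts
# export and list the conversations that contain more than 4 regular chat
# messages. 'hangouts.json' is the default export name assumed above.
if __name__ == "__main__":
    conv_events = load_conv_events('hangouts.json')
    busiest = nb_type_par_conv(conv_events, msg_type='REGULAR_CHAT_MESSAGE', seuil=4)
    print("conversations with more than 4 regular messages (index, count):")
    print(busiest)
    print("event types found:", get_types_actions(conv_events, nb=False))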