Following the meeting of 3rd July, we need to compile the researcher lists ourselves. It would be better not to use any resource available only from inside the ENS, but only public information on the departments' websites.
We need them as a TSV file (one researcher per line, tab-separated fields), with the following columns:
- Last name
- First name
- URL of the home page
- Email address
- Position (PhD student, Professor…)
- Research group
- Department
Only the names and the department are required.
Example file (some fields are left blank):
Baron-Cohen Simon http://www.psychol.cam.ac.uk/directory/simon-baron-cohen ******@cam.ac.uk Academic staff Department of Psychology
Bekinschtein Tristan http://www.psychol.cam.ac.uk/directory/tristan-bekinschtein ******@cam.ac.uk Academic staff Department of Psychology
One way to extract them automatically is to scrape the HTML of department pages. The following is an example for Institut Jean Nicod.
# -*- encoding: utf-8 -*-
from __future__ import unicode_literals

import re
from sys import stdout
from urllib2 import urlopen, HTTPError
from urlparse import urljoin

from httplib2 import iri2uri
from lxml import etree
from lxml.html import document_fromstring

from papers.name import *
# Loose pattern for a bare e-mail address (letters, dots, hyphens, underscores).
email_re = re.compile(r'[a-zA-Z.\-_]*@[a-zA-Z.\-_]*')

# One (listing-page URL, member role) pair per member category on the site.
_statutory = ('http://www.institutnicod.org/membres/membres-statutaires/?lang=fr', 'Membre statutaire')
_postdoc = ('http://www.institutnicod.org/membres/post-doctorants-35/', 'Post-doctorant·e')
_phd = ('http://www.institutnicod.org/membres/etudiants/doctorants/?lang=fr', 'Doctorant·e')
root_urls = [_statutory, _postdoc, _phd]

# Filled below with (member-page URL, name, role) triples scraped from the listings.
url_roles = []
# Walk each member-listing page and collect (member-page URL, name, role)
# triples; the SPIP "menu-entree" list items link to individual member pages.
for rooti, role in root_urls:
    listing_html = urlopen(rooti).read()
    root = document_fromstring(listing_html)
    for a in root.xpath("//li[@class='menu-entree']/a"):
        # urljoin handles root-relative ("/x"), page-relative and absolute
        # hrefs correctly; plain concatenation to the site root did not.
        url_roles.append((urljoin('http://www.institutnicod.org/', a.get('href')), a.text, role))
# Visit every member page collected above, extract the e-mail address and
# personal web site when present, and emit one TSV record per researcher:
#   last name, first name, URL, e-mail, role, research group.
for link, nom, role in url_roles:
    try:
        page_html = urlopen(link).read()
        mdoc = document_fromstring(page_html)
        groupe = 'Institut Jean Nicod'
        email = ''
        for elem in mdoc.xpath("//a"):
            if not elem.text:
                continue
            href = (elem.get('href') or '').strip()
            if elem.text == 'Contact' or elem.text == 'Email' or href.startswith('mailto:'):
                # Only strip the "mailto:" scheme when it is actually there;
                # blindly slicing 7 chars off an arbitrary href produced
                # garbage addresses before.
                if href.startswith('mailto:'):
                    email = href[7:]
            if elem.text == 'Site Web':
                # A personal home page replaces the department page URL.
                link = elem.get('href')
            # Some anchors label the personal site via a nested <span>.
            # ".//span" searches inside this anchor only; the previous
            # "//span" searched the whole document from the root.
            subspan = elem.xpath(".//span")
            if subspan and subspan[0].text and 'site web' in subspan[0].text.lower():
                link = elem.get('href')
        first, last = parse_comma_name(nom)
        print('\t'.join([last, first, link, email, role, groupe]).encode('utf-8'))
        stdout.flush()
    except HTTPError:
        # Best-effort scrape: a member page that fails to load is skipped.
        pass