#!/usr/bin/env python3
# ---------------------------------------------------------------------------- #
## \file extract.py
## \author Sebastien Beaugrand
## \sa http://beaugrand.chez.com/
## \copyright CeCILL 2.1 Free Software license
# ---------------------------------------------------------------------------- #
import sys
import re
import os
# ---------------------------------------------------------------------------- #
## \fn read_list
# ---------------------------------------------------------------------------- #
def read_list(filename):
    """Return the lines of *filename* as a list of strings.

    Line terminators are stripped.  An empty list is returned when
    *filename* is not an existing regular file.
    """
    if not os.path.isfile(filename):
        return []
    with open(filename) as stream:
        content = stream.read()
    return content.splitlines()
# ---------------------------------------------------------------------------- #
## \fn read_file
# ---------------------------------------------------------------------------- #
def read_file(filename):
    """Return the whole content of *filename* as one string.

    An empty string is returned when *filename* is not an existing
    regular file.
    """
    content = ''
    if os.path.isfile(filename):
        with open(filename) as stream:
            content = stream.read()
    return content
# ---------------------------------------------------------------------------- #
## \fn extract
# ---------------------------------------------------------------------------- #
# Helpers for extract(), built once at import time.
# Raw strings keep the regex escapes from being read as string escapes.
_ADDRESS_RE = re.compile(r'([\w\.\-]+@[\w\.\-]+)')
_PROD_HOST_RE = re.compile(r'@.*\.prod\.')
# Substrings that cause a candidate address to be discarded.  Candidates
# are lower-cased before filtering, so every marker must be lower case
# ('egroups' also covers the '@egroups.com' domain).
_REJECTED_MARKERS = (
    'reply',
    'unsubscribe',
    'egroups',
    'jpg@',
    'png@',
    '@phx.gbl',
    '@mail.gmail.com',
)


def extract(dirname, known=None, suppressed=None):
    """Collect e-mail-like addresses from every file found in *dirname*.

    Each file is read as text (undecodable bytes are ignored), '=\\n'
    sequences are removed so that addresses split across such a line break
    are rejoined, and every ``name@host`` looking token is lower-cased and
    filtered.

    A candidate is discarded when it:
      * contains no dot, starts with an ASCII digit or ends with a dot;
      * contains one of the rejected markers or a '.prod.' host part;
      * occurs literally anywhere in *known*;
      * occurs literally in *suppressed* right after a line break.

    Parameters:
      dirname    -- directory whose entries are scanned (not recursive).
      known      -- text of already known addresses; defaults to the
                    module-level ``cur``.
      suppressed -- text of suppressed addresses; defaults to the
                    module-level ``sup``.

    Returns the list of retained addresses, in scan order, duplicates
    included.

    Prints a message and exits with status 1 when *dirname* is not a
    directory.  Entries that cannot be read are reported on standard
    output and skipped.
    """
    if not os.path.isdir(dirname):
        print('{} not a directory'.format(dirname))
        sys.exit(1)
    # Fall back to the texts loaded by the script's main section.
    if known is None:
        known = cur
    if suppressed is None:
        suppressed = sup

    candidates = []
    for filename in os.listdir(dirname):
        # os.path.join works whether or not dirname ends with a separator.
        path = os.path.join(dirname, filename)
        try:
            # open() is inside the try so that an unreadable entry (for
            # instance a sub-directory) is reported instead of aborting.
            with open(path, errors='ignore') as f:
                text = f.read()
        except UnicodeDecodeError:
            print('UnicodeDecodeError: {0}'.format(filename))
            continue
        except Exception as e:
            print('{0}: {1}'.format(type(e), filename))
            continue
        candidates.extend(_ADDRESS_RE.findall(text.replace('=\n', '')))

    res = []
    for address in (c.lower() for c in candidates):
        if '.' not in address:
            continue
        if address[0] in '0123456789':
            continue
        if address.endswith('.'):
            continue
        if any(marker in address for marker in _REJECTED_MARKERS):
            continue
        if _PROD_HOST_RE.search(address):
            continue
        # Literal substring tests: using the address as a regex pattern
        # would let each '.' match any character.
        if address in known:
            continue
        if '\n' + address in suppressed:
            continue
        res.append(address)
    return res
# ---------------------------------------------------------------------------- #
# main
# ---------------------------------------------------------------------------- #
if __name__ == '__main__':
    # Guarded so that importing this module does not run the script.
    if len(sys.argv) != 2:
        print('Usage: {0} <dir>'.format(sys.argv[0]))
        sys.exit(1)
    # Kept as module-level names: extract() reads cur and sup as globals.
    cur = read_file('mail-pr-.list')
    sup = read_file('mail-pr-.supp')
    root = sys.argv[1]
    if os.path.isdir(root + '/cur/'):
        # The argument is itself a directory holding a cur/ sub-directory.
        res = extract(root + '/cur/')
    else:
        # Otherwise every entry of the argument is expected to hold a cur/
        # sub-directory; extract() exits with an error when one does not.
        res = []
        for entry in os.listdir(root):
            res.extend(extract(root + '/' + entry + '/cur/'))
    # Print each distinct address once, in sorted order.
    for m in sorted(set(res)):
        print(m)