#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import Stemmer
import cgi
import sys
import math

def sd(l):
    """Return the root mean square of the numbers in *l*.

    The value is ``sqrt(sum(x*x) / len(l))``; it equals the standard
    deviation only when the values are already centred on zero.

    The division is forced to floating point so that integer inputs are not
    truncated by Python 2 integer division.

    Raises ZeroDivisionError when *l* is empty.
    """
    return math.sqrt(sum([x * x for x in l]) / float(len(l)))

# Default maximum gap (in token positions) between two occurrences of a word
# for them to count as a repetition; the submitted form can override it.
ignore_threshold = 100

# Tokens that are never reported as repetitions: punctuation, French function
# words, and the conjugated forms of "être" and "avoir".
# Kept as a frozenset because it is only used for membership tests, once per
# token of the analysed text.
stop_words = frozenset([
    # punctuation
    ".", ";", ":", "!", "?", "—", "–", "«", "»",
    # titles and miscellaneous short words
    "a", "si", "dit", "mme", "mlle", "ça", "les", "ils", "où",
    # articles, pronouns, prepositions, conjunctions
    "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle",
    "en", "et", "eux", "il", "je", "la", "le", "leur", "lui", "ma", "mais",
    "me", "même", "mes", "moi", "mon", "ne", "nos", "notre", "nous", "on",
    "ou", "par", "pas", "pour", "qu", "que", "qui", "sa", "se", "ses",
    "son", "sur", "ta", "te", "tes", "toi", "ton", "tu", "un", "une",
    "vos", "votre", "vous",
    # elided forms left over after splitting on the apostrophe
    "c", "d", "j", "l", "à", "m", "n", "s", "t", "y",
    # être
    "être", "été", "étée", "étées", "étés", "étant", "étante", "étants",
    "étantes", "suis", "es", "est", "sommes", "êtes", "sont", "serai",
    "seras", "sera", "serons", "serez", "seront", "serais", "serait",
    "serions", "seriez", "seraient", "étais", "était", "étions", "étiez",
    "étaient", "fus", "fut", "fûmes", "fûtes", "furent", "sois", "soit",
    "soyons", "soyez", "soient", "fusse", "fusses", "fût", "fussions",
    "fussiez", "fussent",
    # avoir
    "ayant", "ayante", "ayantes", "ayants", "eu", "eue", "eues", "eus",
    "ai", "as", "avons", "avez", "ont", "aurai", "auras", "aura", "aurons",
    "aurez", "auront", "aurais", "aurait", "aurions", "auriez", "auraient",
    "avais", "avait", "avions", "aviez", "avaient", "eut", "eûmes", "eûtes",
    "eurent", "aie", "aies", "ait", "ayons", "ayez", "aient", "eusse",
    "eusses", "eût", "eussions", "eussiez", "eussent",
])

form = cgi.FieldStorage()
if "text" in form:
	string = form["text"].value
	ignore_threshold = int(form["threshold"].value)
else:
	print """Content-type: text/html; charset=utf-8

<html>
<head><title>Pleo</title>
</head>
<body><h1>Pleo</h1>
<form method="POST">
	<p><textarea name="text" cols="150" rows="40">Votre texte...</textarea></p>
	<p>Seuil de non-détection : <input type="text" name="threshold" value="100"></p>
	<input type="submit">
</form>
</body>
</html>
"""
	sys.exit(200)


# Split the text into candidate tokens, dropping empty strings and the
# honorifics "M.", "Mme" and "Mlle".
np_normalized = [
    tok
    for tok in re.split("[ ,\n\r;\\-\\'—–«»*]", string)
    if tok not in ('', 'M.', 'Mme', 'Mlle')
]

# A token is taken to be a proper noun when it looks capitalised and the
# token before it does not end a sentence (so the capital is not merely the
# start of a sentence). The very first token is therefore never considered.
noms_propres = set()

for previous, current in zip(np_normalized, np_normalized[1:]):
    if previous[-1] in (".", "?", "!", ":"):
        continue
    if not re.match(r"^[A-Z][a-zéèàçù].+$", current):
        continue
    # Store the name without any sentence punctuation attached to it.
    noms_propres.add(re.sub('[\\.\\!\\?]', '', current.replace(".", "")))

#for np in noms_propres: print np

#pwords = filter(lambda e: e != '' ,re.split("[ ,\n\r\\-\\'\\\".]", string))
#pwords = string.split(" ")
# Display tokens: split only on spaces and apostrophes so that punctuation
# and line breaks stay attached to the tokens and can be re-emitted later.
pwords = re.split("[ \\']", string)

# Pair every token with its position in pwords.
words = [(token, position) for position, token in enumerate(pwords)]


# index maps each stemmed word to the list of token positions where it occurs.
index = {}
stemmer = Stemmer.Stemmer('french')

# NOTE(review): tokens were split only on spaces and apostrophes, so one
# token may span a line break; the substitution below removes the newline
# and glues the two neighbouring words together -- confirm this is intended.
for (word, pos) in words:
    w = re.sub('[.,\n\r—–\\-]', '', word)
    # Skip tokens that are empty once punctuation is stripped (double spaces,
    # lone dashes): they would otherwise be indexed as the word '' and
    # reported as repetitions. Stop words and proper nouns are skipped too.
    if not w or w.lower() in stop_words or w in noms_propres:
        continue
    index.setdefault(stemmer.stemWord(w.lower()), []).append(pos)


#stats = []
#nb_keys = len(index.keys())
#for word,occurrences in index.iteritems():
#	t = (word,{'occurrences':len(occurrences), 'frequency':float(len(occurrences)) / float(nb_keys)})
#	stats.append(t)

#for stat in stats: print stat

# reps collects (stem, cluster) pairs, where a cluster is a list of
# (previous position, position, gap) triples for consecutive occurrences that
# are at most ignore_threshold tokens apart. A larger gap closes the current
# cluster and starts a new one.
reps = []

for word, occurrences in index.iteritems():
    if len(occurrences) == 1:
        continue
    cluster = []
    for earlier, later in zip(occurrences, occurrences[1:]):
        gap = later - earlier
        if gap <= ignore_threshold:
            cluster.append((earlier, later, gap))
            continue
        # Gap too large: flush what has been gathered so far.
        if cluster:
            reps.append((word, cluster))
        cluster = []
    if cluster:
        reps.append((word, cluster))

#for (word, repetitions) in reps: print word + " est répété : " + str(repetitions)

# reps_final holds one (stem, positions, gravity) triple per cluster.
# Gravity is a score clamped to [0.0, 1.0]; it grows with the number of
# occurrences in the cluster and shrinks with the smallest gap between them.
reps_final = []

for (word, repetitions) in reps:
    min_distance = min(gap for (_, _, gap) in repetitions)
    ratio = 20.0*(float(len(repetitions) + 1) / min_distance)+1.0
    gravity = max(0.0, min(1.0, math.log(ratio, 15)/2.0))

    # Every triple contributes its first position; the last triple also
    # contributes its second one, giving each occurrence exactly once.
    occurrences = [first for (first, _, _) in repetitions] + [repetitions[-1][1]]

    reps_final.append((word, occurrences, gravity))

#for (word, occurrences, gravity) in reps_final:	print word + " est répété en positions: " + str(occurrences) + " gravité : " + str(gravity)

# Map every token position that belongs to a cluster to
# (gravity, index of the cluster in reps_final), for use when rendering.
repetitions_occurrences = {}
for idx, entry in enumerate(reps_final):
    score = entry[2]
    repetitions_occurrences.update((occ, (score, idx)) for occ in entry[1])


print """Content-type: text/html; charset=utf-8


<html>
<head><title>Pleo</title>
<link rel="stylesheet" type="text/css" href="../pleo.css">
<script type="text/javascript" src="../jquery-1.3.1.min.js"></script>
</head>
<body>
"""

#for word in words: print str(word) + "<br>"
#for key in index.keys(): print key + " -> " + str(index[key]) + "<br>"

print """
<div id="threshold">
<h1>Seuil de non-détection</h1>
<p>
"""

print str(ignore_threshold) + " mots."

print """
</p>
</div>
<div id="nomspropres"><h1>Noms propres:</h1>
<ul>
"""

for np in noms_propres: print "<li>" + np + "</li>"

print """
</ul>
</div>
<div id="texte">
<h1>Texte</h1>
<p>
"""

for i,w in enumerate(pwords):
	try:
		match = re.match("([^.,\n\r—–\\-]*)",w)
	except AttributeError:
		print "<span style='background:yellow'>Ooops: " + w + "</span><br>"
	w2 = match.group(0)
	if match.end(0) < len(w):
		weird_chars = w[match.end(0):].replace("\n", "</p><p>")
	else:
		weird_chars = ""
#	w2 =  w.replace("\n","</p><p>")
	if i in repetitions_occurrences:
		grav = repetitions_occurrences[i][0] * 255
		if grav > 127:
			red = 255
			blue = green = 255-int(grav)
		elif grav > 63:
			red = 255
			green = 255 - int((grav-63)*2)
			blue = 0
			#blue = 255
			#red = green = 255 - int(grav*2)
		else:
			green = 255
			red = blue = 255 - int(grav*4)
		css_class = "repetition" + str(repetitions_occurrences[i][1])
		print '<a class="' + css_class + '" style="font-weight:bold;background:rgb(' + str(red) + ',' + str(green) + ', '+str(blue)+')" onmouseover="$(\'a.'+ css_class+'\').addClass(\'selected\')" onmouseout="$(\'a.' + css_class + '\').removeClass(\'selected\')">' + w2 + '</a>' + weird_chars,
#		print '<span class="nodisplay gravity' + css_class + '">' + str(repetitions_occurrences[i][0] * 100) + '</span>';
	elif re.sub("[,.\n\r—–\\-]","",w) in noms_propres:
		print '<span style="color:red">' + w2 + '</span>' + weird_chars,
	else: print w2 + weird_chars,

print """
</p>
</div>
<div id="repetitions">
<h1>Répétitions</h1>
<ul>
"""

def sort(x, y):
    """Compare two entries on their third item, largest first.

    Returns -1 when ``x[2]`` is greater than ``y[2]``, 1 when it is smaller
    and 0 otherwise, so that sorting with this comparison puts the highest
    values first.
    """
    return (x[2] < y[2]) - (x[2] > y[2])

for (word, occurrences, gravity) in sorted(reps_final, sort):	print "<li>" + word + " " + str(occurrences) + " gravité : " + str(gravity) + "</li>"

print """
</ul>
</div>
</body>
</html>
"""

sys.exit(200)


