# ----------------------------------------------------------
# AdvaS Advanced Search 
# n-gram module
#
# (C) 2002 - 2005 Frank Hofmann, Chemnitz, Germany
# email fh@efho.de
# ----------------------------------------------------------

# changed 2004-11-21

from basicLists import count_words, convert_list_into_dictionary

def get_ngrams (term, size):
	"""Return the distinct n-grams of length `size` contained in `term`.

	A window of `size` items is moved over `term` one position at a
	time; every distinct slice is reported once, in the order in which
	it first occurs.

	Parameters:
	term -- the sequence to split, normally a string
	size -- length of each n-gram, expected to be a positive integer

	Returns:
	A list of the distinct n-grams. If `term` is shorter than `size`
	no n-gram can be formed and the list holds `term` itself as its
	only element.
	"""

	# length of the term
	term_length = len(term)

	if size > term_length:
		# term too small for the given size: hand back the whole term,
		# wrapped in a list so that callers always get a list of grams
		# (a bare string would be iterated character by character)
		return [term]
	# end if

	# distinct n-grams in order of first occurrence
	ngrams = []

	# n-grams already collected - used for fast duplicate detection
	seen = {}

	# move the left edge of the window over all start positions
	for left in range(term_length - size + 1):
		ngram = term[left:left + size]
		if ngram not in seen:
			seen[ngram] = 1
			ngrams.append(ngram)
		# end if
	# end for

	return ngrams

def comp_ngrams (term1, term2, size):
	"""Compare two terms and return their degree of equality.

	The similarity is the Dice coefficient of the n-grams of both terms:

	S = 2C/(A+B)

	S = degree of equality
	C = number of distinct n-grams contained in term 1 as well as in term 2
	A = number of n-grams returned by get_ngrams for term 1
	B = number of n-grams returned by get_ngrams for term 2

	Parameters:
	term1, term2 -- the terms to compare
	size         -- n-gram length handed to get_ngrams

	Returns:
	The similarity as a float; 0.0 means no common n-gram. If neither
	term yields any n-gram the result is 1.0.
	"""

	# get n-grams for term1 and term2
	ngrams1 = get_ngrams(term1, size)
	ngrams2 = get_ngrams(term2, size)

	A = len(ngrams1)
	B = len(ngrams2)

	if A + B == 0:
		# nothing to compare on either side (e.g. two empty terms):
		# treat the terms as identical instead of dividing by zero
		return 1.0
	# end if

	# count the distinct n-grams that appear in both collections
	C = len(set(ngrams1) & set(ngrams2))

	# 2.0 forces floating point division on every Python version
	return 2.0 * C / (A + B)

