diff options
author | Evgeny Fadeev <evgeny.fadeev@gmail.com> | 2013-02-01 14:56:33 -0300 |
---|---|---|
committer | Evgeny Fadeev <evgeny.fadeev@gmail.com> | 2013-02-01 14:56:33 -0300 |
commit | f50e8b949ae18e0bfad197a67cb2dd70ff3b2077 (patch) | |
tree | 33997db30743cefe3c01418273e22ed008642a7f /askbot/search | |
parent | 739c59b6d7b756b06a6d50a832e748c1c6c73a6a (diff) | |
download | askbot-f50e8b949ae18e0bfad197a67cb2dd70ff3b2077.tar.gz askbot-f50e8b949ae18e0bfad197a67cb2dd70ff3b2077.tar.bz2 askbot-f50e8b949ae18e0bfad197a67cb2dd70ff3b2077.zip |
some work towards multilingual search
Diffstat (limited to 'askbot/search')
-rw-r--r-- | askbot/search/postgresql/__init__.py | 32 | ||||
-rw-r--r-- | askbot/search/postgresql/thread_and_post_models_01022013.plsql | 289 |
2 files changed, 317 insertions, 4 deletions
diff --git a/askbot/search/postgresql/__init__.py b/askbot/search/postgresql/__init__.py index 4e359cc1..3edef65d 100644 --- a/askbot/search/postgresql/__init__.py +++ b/askbot/search/postgresql/__init__.py @@ -3,6 +3,26 @@ from django.db import connection from django.conf import settings as django_settings from django.utils.translation import get_language +#mapping of "django" language names to postgres +LANGUAGE_NAMES = { + 'da': 'danish', + 'nl': 'dutch', + 'en': 'english', + 'fi': 'finnish', + 'fr': 'french', + 'de': 'german', + 'hu': 'hungarian', + 'it': 'italian', + 'ja': 'japanese', + 'nb': 'norwegian', + 'pt': 'portugese', + 'ro': 'romanian', + 'ru': 'russian', + 'es': 'spanish', + 'sv': 'swedish', + 'tr': 'turkish' +} + def setup_full_text_search(script_path): """using postgresql database connection, installs the plsql language, if necessary @@ -37,17 +57,21 @@ def run_full_text_search(query_set, query_text, text_search_vector_name): rank_clause = 'ts_rank(' + table_name + \ '.' + text_search_vector_name + \ - ', plainto_tsquery(%s))' + ', plainto_tsquery(%s, %s))' where_clause = table_name + '.' + \ text_search_vector_name + \ - ' @@ plainto_tsquery(%s)' + ' @@ plainto_tsquery(%s, %s)' + + language_code = get_language() if getattr(django_settings, 'ASKBOT_MULTILINGUAL', True): - where_clause += " AND language_code='" + get_language() + "'" + where_clause += " AND " + table_name + \ + '.' + "language_code='" + language_code + "'" search_query = '&'.join(query_text.split())#apply "AND" operator - extra_params = (search_query,) + language_name = LANGUAGE_NAMES.get(language_code, 'english') + extra_params = (language_name, search_query,) extra_kwargs = { 'select': {'relevance': rank_clause}, 'where': [where_clause,], diff --git a/askbot/search/postgresql/thread_and_post_models_01022013.plsql b/askbot/search/postgresql/thread_and_post_models_01022013.plsql new file mode 100644 index 00000000..8e57ed1b --- /dev/null +++ b/askbot/search/postgresql/thread_and_post_models_01022013.plsql @@ -0,0 +1,289 @@ +/* function testing for existence of a column in a table + if table does not exists, function will return "false" */ +CREATE OR REPLACE FUNCTION column_exists(colname text, tablename text) +RETURNS boolean AS +$$ +DECLARE + q text; + onerow record; +BEGIN + + q = 'SELECT attname FROM pg_attribute WHERE attrelid = ( SELECT oid FROM pg_class WHERE relname = '''||tablename||''') AND attname = '''||colname||''''; + + FOR onerow IN EXECUTE q LOOP + RETURN true; + END LOOP; + + RETURN false; +END; +$$ LANGUAGE plpgsql; + +/* function adding tsvector column to table if it does not exists */ +CREATE OR REPLACE FUNCTION add_tsvector_column(colname text, tablename text) +RETURNS boolean AS +$$ +DECLARE + q text; +BEGIN + IF NOT column_exists(colname, tablename) THEN + q = 'ALTER TABLE ' || tablename || ' ADD COLUMN ' || colname || ' tsvector'; + EXECUTE q; + RETURN true; + ELSE + q = 'UPDATE ' || tablename || ' SET ' || colname || '=NULL'; + EXECUTE q; + RETURN false; + END IF; +END; +$$ LANGUAGE plpgsql; + +/* aggregate function that concatenates tsvectors */ +CREATE OR REPLACE FUNCTION tsv_add(tsv1 tsvector, tsv2 tsvector) +RETURNS tsvector AS +$$ +BEGIN + RETURN tsv1 || tsv2; +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION setup_aggregates() RETURNS boolean AS +$$ +DECLARE + onerow record; +BEGIN + FOR onerow IN SELECT * FROM pg_proc WHERE proname = 'concat_tsvectors' AND proisagg LOOP + DROP AGGREGATE concat_tsvectors(tsvector); + END LOOP; + CREATE AGGREGATE concat_tsvectors ( + BASETYPE = tsvector, + SFUNC = tsv_add, + STYPE = tsvector, + INITCOND = '' + ); + RETURN true; +END; +$$ LANGUAGE plpgsql; + +SELECT setup_aggregates(); + +/* language name resolution function. todo: move languace conv to python code */ +DROP FUNCTION IF EXISTS get_language_name_from_code(lang_code text); +CREATE OR REPLACE FUNCTION get_language_name_from_code(lang_code text) +RETURNS regconfig AS +$$ +BEGIN + IF lang_code = 'en' THEN + RETURN 'english'; + ELSIF lang_code = 'da' THEN + RETURN 'danish'; + ELSIF lang_code = 'nl' THEN + RETURN 'danish'; + ELSIF lang_code = 'fi' THEN + RETURN 'finnish'; + ELSIF lang_code = 'fr' THEN + RETURN 'french'; + ELSIF lang_code = 'de' THEN + RETURN 'german'; + ELSIF lang_code = 'hu' THEN + RETURN 'hungarian'; + ELSIF lang_code = 'it' THEN + RETURN 'italian'; + ELSIF lang_code = 'nb' THEN + RETURN 'norwegian'; + ELSIF lang_code = 'pt' THEN + RETURN 'portugese'; + ELSIF lang_code = 'ro' THEN + RETURN 'romanian'; + ELSIF lang_code = 'ru' THEN + RETURN 'russian'; + ELSIF lang_code = 'es' THEN + RETURN 'spanish'; + ELSIF lang_code = 'sv' THEN + RETURN 'swedish'; + ELSIF lang_code = 'tr' THEN + RETURN 'turkish'; + ELSIF lang_code = 'ja' THEN + RETURN 'japanese'; + END IF; + RETURN 'english'; +END; +$$ LANGUAGE plpgsql; + +/* calculates text search vector for the individual thread row +DOES not include question body post, answers or comments */ +DROP FUNCTION IF EXISTS get_thread_tsv(title text, tagnames text); +CREATE OR REPLACE FUNCTION get_thread_tsv(title text, tagnames text, lang_code text) +RETURNS tsvector AS +$$ +DECLARE + lang_name regconfig; +BEGIN + lang_name = get_language_name_from_code(lang_code); + /* todo add weight depending on votes */ + RETURN setweight(to_tsvector(lang_name, coalesce(title, '')), 'A') || + setweight(to_tsvector(lang_name, coalesce(tagnames, '')), 'A'); +END; +$$ LANGUAGE plpgsql; + +/* calculates text seanch vector for the individual question row */ +DROP FUNCTION IF EXISTS get_post_tsv(text text, post_type text); +CREATE OR REPLACE FUNCTION get_post_tsv(text text, post_type text, lang_code text) +RETURNS tsvector AS +$$ +DECLARE + lang_name regconfig; +BEGIN + /* todo adjust weights to reflect votes */ + lang_name = get_language_name_from_code(lang_code); + IF post_type='question' THEN + RETURN setweight(to_tsvector(lang_name, coalesce(text, '')), 'B'); + ELSIF post_type='answer' THEN + /* todo reflect whether the answer acepted or has many points */ + RETURN setweight(to_tsvector(lang_name, coalesce(text, '')), 'C'); + ELSIF post_type='comment' THEN + RETURN setweight(to_tsvector(lang_name, coalesce(text, '')), 'D'); + ELSE + RETURN to_tsvector(''); + END IF; +END; +$$ LANGUAGE plpgsql; + +/* calculates text search vector for the question body part by thread id +here we extract question title and the text by thread_id and then +calculate the text search vector. In the future question +title will be moved to the askbot_thread table and this function +will be simpler. +*/ +CREATE OR REPLACE FUNCTION get_thread_question_tsv(thread_id integer) +RETURNS tsvector AS +$$ +DECLARE + query text; + onerow record; +BEGIN + query = 'SELECT text, language_code FROM askbot_post WHERE thread_id=' || thread_id || + ' AND post_type=''question'' AND deleted=false'; + FOR onerow in EXECUTE query LOOP + RETURN get_post_tsv(onerow.text, 'question', onerow.language_code); + END LOOP; + RETURN to_tsvector(''); +END; +$$ LANGUAGE plpgsql; + +DROP FUNCTION IF EXISTS get_dependent_comments_tsv(object_id integer, tablename text); +CREATE OR REPLACE FUNCTION get_dependent_comments_tsv(parent_id integer) +RETURNS tsvector AS +$$ +DECLARE + query text; + onerow record; +BEGIN + query = 'SELECT concat_tsvectors(text_search_vector) FROM askbot_post' || + ' WHERE parent_id=' || parent_id || + ' AND post_type=''comment'' AND deleted=false'; + FOR onerow IN EXECUTE query LOOP + RETURN onerow.concat_tsvectors; + END LOOP; + RETURN to_tsvector(''); +END; +$$ LANGUAGE plpgsql; + +DROP FUNCTION IF EXISTS get_dependent_answers_tsv(question_id integer); +CREATE OR REPLACE FUNCTION get_dependent_answers_tsv(thread_id integer) +RETURNS tsvector AS +$$ +DECLARE + query text; + onerow record; +BEGIN + query = 'SELECT concat_tsvectors(text_search_vector) ' || + 'FROM askbot_post WHERE thread_id = ' || thread_id || + ' AND deleted=false'; + FOR onerow IN EXECUTE query LOOP + RETURN onerow.concat_tsvectors; + END LOOP; + RETURN to_tsvector(''); +END; +$$ LANGUAGE plpgsql; + +DROP TRIGGER IF EXISTS thread_search_vector_update_trigger on askbot_thread; +DROP TRIGGER IF EXISTS thread_search_vector_insert_trigger on askbot_thread; +DROP TRIGGER IF EXISTS post_search_vector_update_trigger on askbot_post; + +/* create tsvector columns in the content tables + need to isolate these into own transactions, b/c of a weird mix + of triggers/update and alter table statements +*/ +SELECT add_tsvector_column('text_search_vector', 'askbot_thread'); +COMMIT; +BEGIN; +SELECT add_tsvector_column('text_search_vector', 'askbot_post'); +COMMIT; +BEGIN; +SELECT add_tsvector_column('title_search_vector', 'askbot_thread'); +COMMIT; +BEGIN; + +/* one trigger per table for tsv updates */ + +/* set up update triggers */ +CREATE OR REPLACE FUNCTION thread_update_trigger() RETURNS trigger AS +$$ +DECLARE + title_tsv tsvector; +BEGIN + title_tsv = get_thread_tsv(new.title, new.tagnames, new.language_code); + new.title_search_vector = title_tsv; + new.text_search_vector = title_tsv || + get_thread_question_tsv(new.id) || + get_dependent_answers_tsv(new.id); + RETURN new; +END; +$$ LANGUAGE plpgsql; +CREATE TRIGGER thread_search_vector_update_trigger +BEFORE UPDATE ON askbot_thread FOR EACH ROW EXECUTE PROCEDURE thread_update_trigger(); + +CREATE OR REPLACE FUNCTION thread_insert_trigger() RETURNS trigger AS +$$ +BEGIN + new.text_search_vector = get_thread_tsv(new.title, new.tagnames, new.language_code); + RETURN new; +END; +$$ LANGUAGE plpgsql; +CREATE TRIGGER thread_search_vector_insert_trigger +BEFORE INSERT ON askbot_thread FOR EACH ROW EXECUTE PROCEDURE thread_insert_trigger(); + +/* post trigger */ +CREATE OR REPLACE FUNCTION post_trigger() RETURNS trigger AS +$$ +BEGIN + IF new.post_type = 'question' THEN + new.text_search_vector = get_post_tsv(new.text, 'question', new.language_code) || + get_dependent_comments_tsv(new.id); + ELSIF new.post_type = 'answer' THEN + new.text_search_vector = get_post_tsv(new.text, 'answer', new.language_code) || + get_dependent_comments_tsv(new.id); + ELSIF new.post_type = 'comment' THEN + new.text_search_vector = get_post_tsv(new.text, 'comment', new.language_code); + END IF; + UPDATE askbot_thread SET id=new.thread_id WHERE id=new.thread_id; + return new; +END; +$$ LANGUAGE plpgsql; +CREATE TRIGGER post_search_vector_update_trigger +BEFORE INSERT OR UPDATE ON askbot_post FOR EACH ROW EXECUTE PROCEDURE post_trigger(); + +/* populate tsvectors with data */ +-- post tsvectors +UPDATE askbot_post set text_search_vector = get_post_tsv(text, post_type, language_code) WHERE post_type IN ('answer', 'comment', 'question'); +UPDATE askbot_post as q SET text_search_vector = text_search_vector || + get_dependent_comments_tsv(q.id) WHERE post_type IN ('question', 'answer'); + +--thread tsvector +UPDATE askbot_thread SET text_search_vector = get_thread_tsv(title, tagnames, language_code); +UPDATE askbot_thread as t SET text_search_vector = text_search_vector || + get_dependent_answers_tsv(t.id) || + get_thread_question_tsv(t.id); +UPDATE askbot_thread SET title_search_vector = get_thread_tsv(title, tagnames, language_code); +DROP INDEX IF EXISTS askbot_search_idx; +CREATE INDEX askbot_search_idx ON askbot_thread USING gin(text_search_vector); |