From 7cb7e6de9040e6f0d21390fede044200a0f1d198 Mon Sep 17 00:00:00 2001
From: Egil Moeller
Date: Sun, 11 Apr 2010 02:25:39 +0200
Subject: Added an URL indexer, it currently only greps out URLs from pads and stores them in a separate, searchable table, but it doesn't actually provide a way to use this info.

---
 etherpad/src/plugins/urlIndexer/hooks.js | 47 ++++++++++++++++++++++++++++++++
 etherpad/src/plugins/urlIndexer/main.js  | 43 +++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 etherpad/src/plugins/urlIndexer/hooks.js
 create mode 100644 etherpad/src/plugins/urlIndexer/main.js

(limited to 'etherpad')

diff --git a/etherpad/src/plugins/urlIndexer/hooks.js b/etherpad/src/plugins/urlIndexer/hooks.js
new file mode 100644
index 0000000..922150e
--- /dev/null
+++ b/etherpad/src/plugins/urlIndexer/hooks.js
@@ -0,0 +1,47 @@
+import("etherpad.log");
+import("dispatch.{Dispatcher,PrefixMatcher,forward}");
+import("sqlbase.sqlobj");
+
+/* Matches one "word" character: ASCII digits and letters plus several non-ASCII ranges. */
+REGEX_WORDCHAR = /[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u1FFF\u3040-\u9FFF\uF900-\uFDFF\uFE70-\uFEFE\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDC]/;
+/* Matches one character allowed inside a URL: the listed ASCII characters or a word character. */
+REGEX_URLCHAR = new RegExp('('+/[-:@a-zA-Z0-9_.,~%+\/\\?=&#;()$]/.source+'|'+REGEX_WORDCHAR.source+')');
+/* Matches a whole URL: a scheme prefix, then URL characters, not ending in ':', '.', ',' or ';'. */
+REGEX_URL = new RegExp(/(?:(?:https?|s?ftp|ftps|file|smb|afp|nfs|(x-)?man|gopher|txmt):\/\/|mailto:)/.source+REGEX_URLCHAR.source+'*(?![:.,;])'+REGEX_URLCHAR.source, 'g');
+
+/**
+ * Hook for pad writes: refreshes the URL index of the pad being written.
+ *
+ * Collects every REGEX_URL match in args.pad.text() and compares the
+ * space-joined result with the URLS value cached in PAD_URL_CACHE for
+ * args.padId. Only when they differ is the cache row updated (or inserted)
+ * and the pad's PAD_URL rows deleted and re-inserted, one row per match.
+ *
+ * @param args hook arguments; args.pad and args.padId are read.
+ */
+function padModelWriteToDB(args) {
+  var new_urls = args.pad.text().match(REGEX_URL);
+  if (new_urls == null) new_urls = [];
+  var new_urls_str = new_urls.join(' ');
+
+  var old_urls_row = sqlobj.selectSingle("PAD_URL_CACHE", { PAD_ID: args.padId });
+  /* Same truthiness test as the update/insert choice below. */
+  var old_urls_str = old_urls_row ? old_urls_row['URLS'] : '';
+  var old_urls = old_urls_str != '' ? old_urls_str.split(' ') : [];
+
+  if (new_urls_str != old_urls_str) {
+    log.info({message: 'Updating urls', new_urls:new_urls, old_urls:old_urls});
+
+    if (old_urls_row)
+      sqlobj.update("PAD_URL_CACHE", {PAD_ID: args.padId }, {URLS: new_urls_str});
+    else
+      sqlobj.insert("PAD_URL_CACHE", {PAD_ID: args.padId, URLS: new_urls_str});
+
+    sqlobj.deleteRows("PAD_URL", {PAD_ID: args.padId});
+
+    /* Declare the counter locally; a bare `i` would be an implicit global. */
+    for (var i = 0; i < new_urls.length; i++) {
+      sqlobj.insert("PAD_URL", {PAD_ID: args.padId, URL: new_urls[i]});
+    }
+  }
+}
\ No newline at end of file
diff --git a/etherpad/src/plugins/urlIndexer/main.js b/etherpad/src/plugins/urlIndexer/main.js
new file mode 100644
index 0000000..79bb019
--- /dev/null
+++ b/etherpad/src/plugins/urlIndexer/main.js
@@ -0,0 +1,43 @@
+import("etherpad.log");
+import("plugins.urlIndexer.hooks");
+import("sqlbase.sqlobj");
+import("sqlbase.sqlcommon");
+
+/**
+ * Plugin setup: sets the hook list, description, hook handler and the
+ * install/uninstall callbacks on `this`.
+ */
+function init() {
+  this.hooks = ['padModelWriteToDB'];
+  this.description = 'Indexes URLs linked to in pads so that they can be displayed outside pads, searched for etc.';
+  this.padModelWriteToDB = hooks.padModelWriteToDB;
+
+  this.install = install;
+  this.uninstall = uninstall;
+}
+
+/**
+ * Creates the plugin's tables: PAD_URL (PAD_ID plus a single URL) and
+ * PAD_URL_CACHE (a unique PAD_ID plus a URLS text column).
+ */
+function install() {
+  log.info("Installing urlIndexer");
+
+  sqlobj.createTable('PAD_URL', {
+    PAD_ID: 'varchar(128) character set utf8 collate utf8_bin not null references PAD_META(ID)',
+    URL: 'varchar(1024) character set utf8 collate utf8_bin not null',
+  });
+
+  sqlobj.createTable('PAD_URL_CACHE', {
+    PAD_ID: 'varchar(128) character set utf8 collate utf8_bin unique not null references PAD_META(ID)',
+    URLS: 'text collate utf8_bin not null',
+  });
+}
+
+/**
+ * Logs the uninstall; the tables created by install() are not dropped here.
+ */
+function uninstall() {
+  log.info("Uninstalling urlIndexer");
+}
+
-- 
cgit v1.2.3-1-g7c22