Repository
initial search result storing
Parent commits : fb85d65983efaab9d3d29f2f389185310bfd08b6,Children commits :
By Laurent Defert on 2019-08-15 11:44:25
initial search result storing
Difference with parent commit fb85d65983efaab9d3d29f2f389185310bfd08b6
Files modified:
local/router.py
---
+++
@@ -8,6 +8,7 @@
from local.cacert import cacert_download, cacert_generate
from local.download import dl_download, download_delete, downloads_view, download_view, download_save, direct_download
from local.index import render_index, UI_INDEX, UI_PATH
+from local.searchs import SearchResult, last_searches
from local.search_engine import SearchEngine, search_redirect
from local.settings import settings_view
from local.view import api_list_view
@@ -32,7 +33,9 @@
},
'settings': settings_view,
'urls': lambda req, q: api_list_view(UrlAccess, 1, req, q),
- 'search_engines': lambda req, q: api_list_view(SearchEngine, 0, req, q)
+ 'search_engines': lambda req, q: api_list_view(SearchEngine, 0, req, q),
+ 'search_results': lambda req, q: api_list_view(SearchResult, 1, req, q),
+ 'last_searches': last_searches
}
}
local/search_engine.py
---
+++
@@ -1,4 +1,5 @@
-from urllib.parse import quote_plus
+import re
+import urllib.parse
from defusedxml import ElementTree
from sqlalchemy import Column, Integer, String
@@ -6,7 +7,7 @@
from local.sql import Base, Url
-
+
SHORTCUTS = {
'DuckDuckGo': '',
'GitHub': 'gh'
@@ -52,7 +53,7 @@
se.html_template = elem.get('template')
elif elem.get('type') == 'application/x-suggestions+json':
se.suggestion_template = elem.get('template')
-
+
for elem in root.findall(ns + 'Image'):
if elem.get('height') != '16' or elem.get('width') != '16':
continue
@@ -71,12 +72,86 @@
cls.parse_odf(db, buf)
@classmethod
def _get_search_terms(cls, se, url, in_url):
    """Extract the text that stands in for ``{searchTerms}`` in *url*.

    *se* is a fragment of a search engine template (a path or a single
    query parameter value) containing the literal ``{searchTerms}``
    placeholder. *url* is the matching fragment of the visited URL.

    When *in_url* is true the placeholder is a path segment and stops at
    the next ``/``; otherwise it may match any characters.

    Returns the captured search terms, or ``None`` when *url* does not
    match the template fragment. If *se* contains no placeholder, the
    whole matched text is returned.
    """
    print('term matching %s vs %s' % (se, url))

    # Capture the placeholder so that only the search terms are returned,
    # not the literal template text surrounding them.
    if in_url:
        replace_by = '([^/]*)'
    else:
        replace_by = '(.*)'

    # Escape the literal parts separately so the result does not depend on
    # how re.escape() treats the braces of the placeholder.
    literal_parts = se.split('{searchTerms}')
    terms_re = replace_by.join(re.escape(part) for part in literal_parts)

    print('check re %s / %s' % (terms_re, url))
    m = re.search(terms_re, url)

    if m is None:
        return None

    if m.re.groups:
        return m.group(1)
    # No placeholder in the template fragment: nothing to capture.
    return m.group(0)
+
@classmethod
def get_from_url(cls, db, url):
    """Find the search engine and search terms that *url* corresponds to.

    Every ``SearchEngine`` returned by ``db.query(SearchEngine).all()`` is
    tried in turn. An engine matches when the scheme and network location
    of its ``html_template`` equal those of *url* and either:

    * the template path contains ``{searchTerms}`` and the path of *url*
      matches it, or
    * the paths are equal, every fixed query parameter of the template is
      present in *url* with the same value, and the parameter holding
      ``{searchTerms}`` is present.

    Returns a ``(search_engine, terms)`` tuple for the first match, or
    ``None`` when *url* is ``None`` or no engine matches.
    """
    if url is None:
        return None
    parsed_url = urllib.parse.urlsplit(url)
    url_params = urllib.parse.parse_qs(parsed_url.query)

    print('checking url %s for searhc' % url)
    for se in db.query(SearchEngine).all():
        print('against %s' % se.html_template)
        se_url = urllib.parse.urlsplit(se.html_template)

        if se_url.scheme != parsed_url.scheme or se_url.netloc != parsed_url.netloc:
            print('scheme or netloc not matching for %s' % se.html_template)
            continue

        # Match the url
        if '{searchTerms}' in se_url.path:
            # Only the path component is compared, so the query string
            # and fragment cannot leak into the captured terms.
            path = urllib.parse.unquote_plus(parsed_url.path)
            terms = cls._get_search_terms(se_url.path, path, True)

            if terms is None:
                print('search term in url not matched (se %s)' % se.html_template)
                continue
            print('found search %s on %s' % (terms, se.html_template))
            return se, terms

        if se_url.path != parsed_url.path:
            print('se %s path did not match' % se.html_template)
            continue

        # Path matched, find search term in params
        se_params = urllib.parse.parse_qs(se_url.query)
        terms = None
        for key, val in se_params.items():
            val = val[0]
            if '{searchTerms}' in val:
                if key not in url_params:
                    print('param %s for se %s missing' % (key, se.html_template))
                    break
                terms = cls._get_search_terms(val, url_params[key][0], False)

                if terms is None:
                    print('no param %s match for se %s' % (key, se.html_template))
                    break
            elif val not in url_params.get(key, ()):
                # parse_qs() maps each key to a list of values, so the
                # fixed template value is looked up in that list.
                print('non matching param %s = %s for se %s (%s = %s)' % (key, url_params.get(key), se.html_template, key, val))
                break
        else:
            if terms is None:
                # The template query has no {searchTerms} placeholder, so
                # a matching page is not a search result page.
                continue
            print('matching se %s, search terms: %s' % (se.html_template, terms))
            return se, terms

    print('no se match')
    return None
def search_redirect(request, query, search_id):
search = query.get('q', [''])[0]
search_engine = request.db.query(SearchEngine).get(search_id)
- location = search_engine.html_template.replace('{searchTerms}', quote_plus(search))
+ location = search_engine.html_template.replace('{searchTerms}', urllib.parse.quote_plus(search))
request.send_response(302)
request.send_header('Location', location)
request.end_headers()
proxy2.py
---
+++
@@ -33,6 +33,7 @@
from local.index import load_index_content, render_index
from local.router import router
from local.sql import Base, Url
+from local.searchs import SearchTerms, SearchResult
from local.search_engine import SearchEngine
@@ -124,9 +125,9 @@
print('path %s / %s' % (self.path, my_addr))
print('websocket %s' % self.headers.get('Upgrade'))
+ global conf
# Relay CONNECT's to the web ui when dev is enabled
if (self.path.startswith(my_address()) or self.path == my_addr) and (self.command != 'CONNECT' or not conf.dev):
- global conf
router.handle(self, conf)
else:
self.proxy_request()
@@ -169,7 +170,22 @@
mime = mime.split(';', 1)[0]
status = res.status
- UrlAccess.log(self.db, self.path, mime, self.headers['Referer'], status)
+ UrlAccess.log(self.db, self.path, mime, self.headers.get('Referer'), status)
+
+ log_search = mime == 'text/html' and self.command == 'GET'
+ log_search = log_search and status >= 200 and status < 400
+ log_search = log_search and not self.headers.get('X-Requested-With')
+ if log_search:
+ search_match = SearchEngine.get_from_url(self.db, self.headers.get('referer'))
+
+ if search_match is not None:
+ se, terms = search_match
+ terms = SearchTerms.get_or_create(self.db, terms)
+ referer = Url.get_or_create(self.db, self.headers.get('Referer'))
+ url = Url.get_or_create(self.db, self.path, mime)
+ search = SearchResult(search_terms=terms, search_engine=se, url=url)
+ self.db.add(search)
+ self.db.commit()
def render_index(self, status, *args, **kwargs):
response = "%s %d %s\r\n" % (self.protocol_version, status.value, status.name)
@@ -219,6 +235,7 @@
my_addr = my_addr[len('http://'):]
# Relay websocket in dev mode
+ global conf
relay = (conf.dev is True and (self.path.startswith(my_address()) or self.path == my_addr))
# Relay websocket on http
@@ -264,6 +281,7 @@
my_addr = my_address().rstrip('/')
my_addr = my_addr[len('http://'):]
+ global conf
relay_self = (conf.dev is True and (self.path.startswith(my_address()) or self.path == my_addr))
if relay_self:
@@ -363,6 +381,7 @@
if inject:
print('inject/content-type:', res.getheader('content-type'))
+ global conf
if is_download and res.status >= 200 and res.status < 300:
if '/' in filename:
_, filename = filename.rsplit('/', 1)
@@ -467,6 +486,9 @@
return res
def _is_download(self, netloc, path, res):
+ if self.headers.get('X-Requested-With')
+ return False, None
+
if ':' in netloc:
netloc = netloc.split(':', 1)[0]
search/github.xml
---
+++
@@ -4,6 +4,6 @@
<Description>Search GitHub</Description>
<InputEncoding>UTF-8</InputEncoding>
<Image width="16" height="16" type="image/x-icon">https://github.com/favicon.ico</Image>
- <Url type="text/html" method="get" template="https://github.com/search?q={searchTerms}&ref=opensearch"/>
+ <Url type="text/html" method="get" template="https://github.com/search?q={searchTerms}"/>
<moz:SearchForm>https://github.com/search</moz:SearchForm>
</OpenSearchDescription>