DLProxy

Repository

initial search result storing

Parent commits : fb85d65983efaab9d3d29f2f389185310bfd08b6,
Children commits :

By Laurent Defert on 2019-08-15 11:44:25
initial search result storing

Difference with parent commit fb85d65983efaab9d3d29f2f389185310bfd08b6
Files modified:
local/router.py

--- 
+++ 
@@ -8,6 +8,7 @@
 from local.cacert import cacert_download, cacert_generate
 from local.download import dl_download, download_delete, downloads_view, download_view, download_save, direct_download
 from local.index import render_index, UI_INDEX, UI_PATH
+from local.searchs import SearchResult, last_searches
 from local.search_engine import SearchEngine, search_redirect
 from local.settings import settings_view
 from local.view import api_list_view
@@ -32,7 +33,9 @@
             },
             'settings': settings_view,
             'urls': lambda req, q: api_list_view(UrlAccess, 1, req, q),
-            'search_engines': lambda req, q: api_list_view(SearchEngine, 0, req, q)
+            'search_engines': lambda req, q: api_list_view(SearchEngine, 0, req, q),
+            'search_results': lambda req, q: api_list_view(SearchResult, 1, req, q),
+            'last_searches': last_searches
         }
     }

local/search_engine.py

--- 
+++ 
@@ -1,4 +1,5 @@
-from urllib.parse import quote_plus
+import re
+import urllib.parse
 
 from defusedxml import ElementTree
 from sqlalchemy import Column, Integer, String
@@ -6,7 +7,7 @@
 
 from local.sql import Base, Url
 
- 
+
 SHORTCUTS = {
     'DuckDuckGo': '',
     'GitHub': 'gh'
@@ -52,7 +53,7 @@
                 se.html_template = elem.get('template')
             elif elem.get('type') == 'application/x-suggestions+json':
                 se.suggestion_template = elem.get('template')
-            
+
         for elem in root.findall(ns + 'Image'):
             if elem.get('height') != '16' or elem.get('width') != '16':
                 continue
@@ -71,12 +72,86 @@
 
         cls.parse_odf(db, buf)
 
+    @classmethod
+    def _get_search_terms(cls, se, url, in_url):
+        """Extract the text standing in for ``{searchTerms}`` in *url*.
+
+        :param se: template fragment (a URL path or a query parameter value)
+            containing the ``{searchTerms}`` placeholder.
+        :param url: actual text to match against the template fragment.
+        :param in_url: when true the placeholder is a path component and
+            cannot contain ``/``; otherwise it may match any characters.
+        :return: the text found in place of the placeholder, or ``None``
+            when *url* does not match the template.
+        """
+        print('term matching %s vs %s' % (se, url))
+        terms_re = re.escape(se)
+
+        if in_url:
+            replace_by = '([^/]*)'
+        else:
+            replace_by = '(.*)'
+        # Escape the placeholder the same way the template was escaped so
+        # the substitution does not depend on which characters re.escape()
+        # chooses to escape.
+        terms_re = terms_re.replace(re.escape('{searchTerms}'), replace_by)
+
+        print('check re %s / %s' % (terms_re, url))
+        m = re.search(terms_re, url)
+
+        if m is None:
+            return None
+
+        # Group 1 is the placeholder itself; group 0 would also include the
+        # literal parts of the template surrounding it.
+        return m.group(1)
+
+    @classmethod
+    def get_from_url(cls, db, url):
+        """Find the search engine, and the search terms, matching *url*.
+
+        Every stored ``SearchEngine`` has its ``html_template`` compared with
+        *url*: scheme and network location must be equal, then the
+        ``{searchTerms}`` placeholder is looked up either in the template's
+        path or in its query-string parameters.
+
+        :param db: database session used to query ``SearchEngine`` rows.
+        :param url: URL to analyse; may be ``None``.
+        :return: a ``(search_engine, terms)`` tuple for the first engine that
+            matches, or ``None`` when *url* is ``None`` or no engine matches.
+        """
+        if url is None:
+            return None
+        parsed_url = urllib.parse.urlsplit(url)
+        url_params = urllib.parse.parse_qs(parsed_url.query)
+
+        print('checking url %s for searhc' % url)
+        for se in db.query(SearchEngine).all():
+            print('against %s' % se.html_template)
+            se_url = urllib.parse.urlsplit(se.html_template)
+
+            if se_url.scheme != parsed_url.scheme or se_url.netloc != parsed_url.netloc:
+                print('scheme or netloc not matching for %s' % se.html_template)
+                continue
+
+            # Match the url
+            if '{searchTerms}' in se_url.path:
+                # Only the path is searched, so the query string cannot end
+                # up inside the captured terms.
+                path = urllib.parse.unquote_plus(parsed_url.path)
+                terms = cls._get_search_terms(se_url.path, path, True)
+
+                if terms is None:
+                    print('search term in url not matched (se %s)' % se.html_template)
+                    continue
+                print('found search %s on %s' % (terms, se.html_template))
+                return se, terms
+
+            if se_url.path != parsed_url.path:
+                print('se %s path did not match' % se.html_template)
+                continue
+
+            # Path matched, find search term in params
+            se_params = urllib.parse.parse_qs(se_url.query)
+            terms = None
+            for key, val in se_params.items():
+                val = val[0]
+                if '{searchTerms}' in val:
+                    if key not in url_params:
+                        print('param %s for se %s missing' % (key, se.html_template))
+                        break
+                    terms = cls._get_search_terms(val, url_params[key][0], False)
+
+                    if terms is None:
+                        print('no param %s match for se %s' % (key, se.html_template))
+                        break
+                # parse_qs() maps each key to a list of values, so compare
+                # the first value with the template's fixed value.
+                elif url_params.get(key, [None])[0] != val:
+                    print('non matching param %s = %s for se %s (%s = %s)' % (key, url_params.get(key), se.html_template, key, val))
+                    break
+            else:
+                # Every template parameter matched; a template without any
+                # {searchTerms} placeholder is not treated as a search.
+                if terms is not None:
+                    print('matching se %s, search terms: %s' % (se.html_template, terms))
+                    return se, terms
+
+        # Reached only after every search engine has been tried.
+        print('no se match')
+        return None
 
 def search_redirect(request, query, search_id):
     search = query.get('q', [''])[0]
     search_engine = request.db.query(SearchEngine).get(search_id)
 
-    location = search_engine.html_template.replace('{searchTerms}', quote_plus(search))
+    location = search_engine.html_template.replace('{searchTerms}', urllib.parse.quote_plus(search))
     request.send_response(302)
     request.send_header('Location', location)
     request.end_headers()

proxy2.py

--- 
+++ 
@@ -33,6 +33,7 @@
 from local.index import load_index_content, render_index
 from local.router import router
 from local.sql import Base, Url
+from local.searchs import SearchTerms, SearchResult
 from local.search_engine import SearchEngine
 
 
@@ -124,9 +125,9 @@
 
             print('path %s / %s' % (self.path, my_addr))
             print('websocket %s' % self.headers.get('Upgrade'))
+            global conf
             # Relay CONNECT's to the web ui when dev is enabled
             if (self.path.startswith(my_address()) or self.path == my_addr) and (self.command != 'CONNECT' or not conf.dev):
-                global conf
                 router.handle(self, conf)
             else:
                 self.proxy_request()
@@ -169,7 +170,22 @@
                         mime = mime.split(';', 1)[0]
                     status = res.status
 
-                UrlAccess.log(self.db, self.path, mime, self.headers['Referer'], status)
+                UrlAccess.log(self.db, self.path, mime, self.headers.get('Referer'), status)
+
+                log_search = mime == 'text/html' and self.command == 'GET'
+                log_search = log_search and status >= 200 and status < 400
+                log_search = log_search and not self.headers.get('X-Requested-With')
+                if log_search:
+                    search_match = SearchEngine.get_from_url(self.db, self.headers.get('referer'))
+
+                    if search_match is not None:
+                        se, terms = search_match
+                        terms = SearchTerms.get_or_create(self.db, terms)
+                        referer = Url.get_or_create(self.db, self.headers.get('Referer'))
+                        url = Url.get_or_create(self.db, self.path, mime)
+                        search = SearchResult(search_terms=terms, search_engine=se, url=url)
+                        self.db.add(search)
+                        self.db.commit()
 
     def render_index(self, status, *args, **kwargs):
         response = "%s %d %s\r\n" % (self.protocol_version, status.value, status.name)
@@ -219,6 +235,7 @@
         my_addr = my_addr[len('http://'):]
 
         # Relay websocket in dev mode
+        global conf
         relay = (conf.dev is True and (self.path.startswith(my_address()) or self.path == my_addr))
 
         # Relay websocket on http
@@ -264,6 +281,7 @@
 
         my_addr = my_address().rstrip('/')
         my_addr = my_addr[len('http://'):]
+        global conf
         relay_self = (conf.dev is True and (self.path.startswith(my_address()) or self.path == my_addr))
 
         if relay_self:
@@ -363,6 +381,7 @@
             if inject:
                 print('inject/content-type:', res.getheader('content-type'))
 
+            global conf
             if is_download and res.status >= 200 and res.status < 300:
                 if '/' in filename:
                     _, filename = filename.rsplit('/', 1)
@@ -467,6 +486,9 @@
         return res
 
     def _is_download(self, netloc, path, res):
+        if self.headers.get('X-Requested-With')
+            return False, None
+
         if ':' in netloc:
             netloc = netloc.split(':', 1)[0]

search/github.xml

--- 
+++ 
@@ -4,6 +4,6 @@
   <Description>Search GitHub</Description>
   <InputEncoding>UTF-8</InputEncoding>
   <Image width="16" height="16" type="image/x-icon">https://github.com/favicon.ico</Image>
-  <Url type="text/html" method="get" template="https://github.com/search?q={searchTerms}&amp;ref=opensearch"/>
+  <Url type="text/html" method="get" template="https://github.com/search?q={searchTerms}"/>
   <moz:SearchForm>https://github.com/search</moz:SearchForm>
 </OpenSearchDescription>