Added webtech, still alpha

Giulio 2019-04-14 16:04:33 +02:00
parent 8a0e31a66a
commit d0ba4c3b75
15 changed files with 18663 additions and 6 deletions


@@ -1,12 +1,20 @@
import ripe
import censys
import bong
import webtech
import sys
import json
r = ripe.Ripe()
c = censys.Censys_WEB("dummyuser", "dummypass")
c = censys.Censys_WEB("stripped", "stripped")
b = bong.Bing()
w = webtech.WebTech(options={'json': True})
targets = r.search(sys.argv[1])
print("Found " + str(len(targets)) + " ranges from Ripe")
hosts = c.search_ipv4(c.build_query_ipv4(targets))
@@ -23,5 +31,44 @@ for host in hosts:
for vhost in host_bing['vhosts']:
if vhost not in result_vhosts:
result_vhosts.append(vhost)
-result.append({'ip': result_ip, 'urls': result_urls, 'vhosts': result_vhosts, 'protocols': host['protocols']})
-print(json.dumps(result))
+result.append({'ip': result_ip, 'urls': result_urls, 'vhosts': list(dict.fromkeys(result_vhosts)), 'protocols': host['protocols']})
+print("Result has " + str(len(result)) + " entries")
final = {}
for host in result:
if "443/https" in host['protocols']:
try:
url = 'https://' + host['ip']
report = w.start_from_url(url, timeout=2)
final[url] = report
except webtech.utils.ConnectionException:
print("Site down " + url)
if "80/http" in host['protocols']:
try:
url = 'http://' + host['ip']
report = w.start_from_url('http://' + host['ip'], timeout=2)
final[url] = report
except webtech.utils.ConnectionException:
print("Site down " + url)
for vhost in host['vhosts']:
if "443/https" in host['protocols']:
try:
url = 'https://' + host['ip'] + ' (' + vhost + ')'
report = w.start_from_url('https://' + host['ip'], headers={'Host': vhost}, timeout=2)
final[url] = report
except webtech.utils.ConnectionException:
print("Site down " + url)
if "80/http" in host['protocols']:
try:
url = 'http://' + host['ip'] + ' (' + vhost + ')'
report = w.start_from_url('http://' + host['ip'], headers={'Host': vhost}, timeout=2)
final[url] = report
except webtech.utils.ConnectionException:
print("Site down " + url)
for url in host['urls']:
try:
report = w.start_from_url(url, timeout=2)
final[url] = report
except webtech.utils.ConnectionException:
print("Site down " + url)
print(json.dumps(final, indent=4))
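
A rough sketch of the output this script ends up printing, not taken from a real scan: with options={'json': True} each report returned by start_from_url is the plain dict built in webtech/target.py (generate_report) and webtech/encoder.py below, so the final JSON should be shaped like this (IP, technology and header values are invented for illustration):

import json
# Hypothetical example of the structure of `final`; values are made up.
example_final = {
    "https://192.0.2.10": {
        "tech": [{"name": "IIS", "version": "8.5"}],
        "headers": [{"name": "X-Powered-By", "value": "ASP.NET"}],
    }
}
print(json.dumps(example_final, indent=4))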


@@ -53,13 +53,17 @@ class Censys_WEB:
self.url = 'https://censys.io/'
self.username = username
self.password = password
-if self.login():
+self.session = self.login()
self.ipv4 = []
def login(self):
s = requests.session()
requests.get(self.url)
r = s.get(self.url + "/login")
html = BeautifulSoup(r.text, "lxml")
csrf = html.find('input', {'name': 'csrf_token'})['value']
r = s.post(self.url + "/login", data={'login': self.username, 'password': self.password, 'csrf_token': csrf, 'came_from': '/'}, allow_redirects=False)
if r.status_code != 302:
print("Wrong creds for Censys")
return s
def build_query_ipv4(self, targets):

110 webtech/.gitignore vendored Normal file

@@ -0,0 +1,110 @@
webtech/apps.json
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# editors
.vscode

3 webtech/__init__.py Normal file

@@ -0,0 +1,3 @@
from .webtech import WebTech
name = "webtech"

58 webtech/__main__.py Normal file

@@ -0,0 +1,58 @@
#!/usr/bin/env python
import sys
from optparse import OptionParser
from .__version__ import __version__ as VERSION
from .webtech import WebTech
def split_on_comma(option, opt_str, value, parser):
setattr(parser.values, option.dest, value.split(','))
def main():
"""
Main function when running from command line.
"""
parser = OptionParser(prog="webtech", version="%prog {}".format(VERSION))
parser.add_option(
"-u", "--urls",
help="url(s) to scan", type="string", action="callback", callback=split_on_comma)
parser.add_option(
"--urls-file", "--ul",
help="url(s) list file to scan", type="string")
parser.add_option(
"--user-agent", "--ua",
help="use this user agent")
parser.add_option(
"--random-user-agent", "--rua", action="store_true",
help="use a random user agent", default=False)
parser.add_option(
"--database-file", "--db",
help="custom database file")
parser.add_option(
"--json", "--oj", action="store_true",
help="output json-encoded report", default=False)
parser.add_option(
"--grep", "--og", action="store_true",
help="output grepable report", default=False)
parser.add_option(
"--update-db", "--udb", action="store_true",
help="force update of remote db files", default=False)
parser.add_option(
"--timeout", type="float", help="maximum timeout for scrape requests", default=10)
(options, _args) = parser.parse_args(sys.argv)
options = vars(options)
if options.get('urls') is None and options.get('urls_file') is None and not options.get('update_db'):
print("No URL(s) given!")
parser.print_help()
exit()
wt = WebTech(options)
wt.start()
if __name__ == "__main__":
main()
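
The options dict that optparse builds here is handed straight to WebTech, so the CLI and the library entry point are interchangeable. A minimal programmatic sketch, roughly equivalent to running "webtech --urls https://example.com --json" (the URL is a placeholder):

# Sketch only: same option keys that the parser above sets.
from webtech import WebTech

wt = WebTech(options={'urls': ['https://example.com'], 'json': True, 'timeout': 5})
wt.start()  # prints a dict mapping each URL to its report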

2 webtech/__version__.py Normal file

@@ -0,0 +1,2 @@
# DON'T EDIT THIS FILE
__version__ = "1.2.5"

13379 webtech/apps.json Normal file

File diff suppressed because it is too large.

146 webtech/database.py Normal file

@@ -0,0 +1,146 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os.path
import time
try:
from urllib.request import urlopen
from urllib.error import URLError
except ImportError as e:
from urllib2 import urlopen, URLError
from .utils import UpdateInBurpException
INSTALLATION_DIR = os.path.realpath(os.path.dirname(__file__))
DATABASE_FILE = os.path.join(INSTALLATION_DIR, "webtech.json")
WAPPALYZER_DATABASE_FILE = os.path.join(INSTALLATION_DIR, "apps.json")
WAPPALYZER_DATABASE_URL = "https://raw.githubusercontent.com/AliasIO/Wappalyzer/master/src/apps.json"
WEBTECH_DATABASE_URL = "https://raw.githubusercontent.com/ShielderSec/webtech/master/webtech/webtech.json"
DAYS = 60 * 60 * 24
def download_database_file(url, target_file):
"""
Download the database file from the Wappalyzer repository
"""
print("Updating database...")
response = urlopen(url)
with open(target_file, 'wb') as out_file:
out_file.write(response.read())
print("Database updated successfully!")
def save_database_file(content, target_file):
with open(target_file, 'wb') as out_file:
out_file.write(content)
print("Database updated successfully!")
def download(webfile, dbfile, name, force=False, burp=False):
"""
Check if outdated and download file
"""
now = int(time.time())
if not os.path.isfile(dbfile):
print("{} Database file not present.".format(name))
if burp:
raise UpdateInBurpException()
download_database_file(webfile, dbfile)
# set timestamp in filename
else:
last_update = int(os.path.getmtime(dbfile))
if last_update < now - 30 * DAYS or force:
if burp:
raise UpdateInBurpException()
if force:
print("Force update of {} Database file".format(name))
else:
print("{} Database file is older than 30 days.".format(name))
os.remove(dbfile)
download_database_file(webfile, dbfile)
def update_database(args=None, force=False, burp=False):
"""
Update the database if it's not present or too old
"""
try:
download(WAPPALYZER_DATABASE_URL, WAPPALYZER_DATABASE_FILE, "Wappalyzer", force=force, burp=burp)
download(WEBTECH_DATABASE_URL, DATABASE_FILE, "WebTech", force=force, burp=burp)
return True
except URLError as e:
print("Unable to update database, check your internet connection and Github.com availability.")
return False
def merge_databases(db1, db2):
"""
This helper function merges elements from two databases without overriding existing elements.
It is not generic and *follows the Wappalyzer db scheme*
"""
# Wappalyzer DB format must have an apps object
db1 = db1['apps']
db2 = db2['apps']
merged_db = db1
for prop in db2:
if merged_db.get(prop) is None:
# if the element appears only in db2, add it to db1
# TODO: Validate type of db2[prop]
merged_db[prop] = db2[prop]
else:
# both db contains the same property, merge its children
element = merged_db[prop]
for key, value in db2[prop].items():
if merged_db[prop].get(key) is None:
# db1's prop doesn't have this key, add it freely
if type(value) in [str, list, dict]:
element[key] = value
else:
raise ValueError('Wrong type in database: only "dict", "list" or "str" are permitted - element of type {}'.format(type(value).__name__))
else:
# both db's prop have the same key, pretty disappointing :(
element[key] = merge_elements(merged_db[prop][key], value)
merged_db[prop] = element
return {'apps': merged_db}
def merge_elements(el1, el2):
"""
Helper function to merge 2 elements of different types
Note: el2 has priority over el1 and can override it
The possible cases are:
dict & dict -> merge keys and values
list & list -> merge arrays and remove duplicates
list & str -> add str to array and remove duplicates
str & str -> make a list and remove duplicates
all other cases will raise a ValueError exception
"""
if isinstance(el1, dict):
if isinstance(el2, dict):
# merge keys and value
el1.update(el2)
return el1
else:
raise ValueError('Incompatible types when merging databases: element1 of type {}, element2 of type {}'.format(type(el1).__name__, type(el2).__name__))
elif isinstance(el1, list):
if isinstance(el2, list):
# merge arrays and remove duplicates
el1.extend(el2)
return list(set(el1))
elif isinstance(el2, str):
# add string to array and remove duplicates
el1.append(el2)
return list(set(el1))
else:
raise ValueError('Incompatible types when merging databases: element1 of type {}, element2 of type {}'.format(type(el1).__name__, type(el2).__name__))
elif isinstance(el1, str):
if isinstance(el2, str):
# make a list and remove duplicates
return list(set([el1, el2]))
else:
return merge_elements(el2, el1)
raise ValueError('Wrong type in database: only "dict", "list" or "str" are permitted - element of type {}'.format(type(el1).__name__))
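
A small worked example of the merge rules described in the docstrings above; the app names and patterns are made up purely for illustration:

from webtech.database import merge_databases, merge_elements

print(merge_elements("1.0", "2.0"))    # -> ['1.0', '2.0'] (set-based, order may vary)
print(merge_elements(["a"], "b"))      # -> ['a', 'b']
merged = merge_databases(
    {'apps': {'Foo': {'headers': {'Server': 'foo'}}}},
    {'apps': {'Foo': {'headers': {'X-Foo': ''}}, 'Bar': {'html': 'bar'}}})
# merged['apps']['Foo']['headers'] now holds both header rules, and 'Bar' is added as-is.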

9 webtech/encoder.py Normal file

@@ -0,0 +1,9 @@
#!/usr/bin/env python
import json
class Encoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, set):
return list({"name": entry.name, "version": entry.version} for entry in obj)
return json.JSONEncoder.default(self, obj)
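
For illustration, this is how the set of Tech tuples collected in webtech/target.py serializes through this encoder (the name and version are invented):

import json
from webtech.encoder import Encoder
from webtech.utils import Tech

report = {'tech': {Tech(name='IIS', version='8.5')}, 'headers': []}
print(json.dumps(report, cls=Encoder))
# -> {"tech": [{"name": "IIS", "version": "8.5"}], "headers": []}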

28 webtech/parser.py Normal file

@@ -0,0 +1,28 @@
#!/usr/bin/env python
try:
from html.parser import HTMLParser
except ImportError:
from HTMLParser import HTMLParser
# Don't blame me for this mess, we can't use external libs and all we have is HTMLParser
class WTParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.meta = {}
self.scripts = []
def handle_starttag(self, tag, attrs):
if tag == 'meta':
m = {}
for name, value in attrs:
m[name] = value
name = m.get('name') or m.get('property')
if name:
self.meta[name] = m.get('content', '')
elif tag == 'script':
for name, value in attrs:
if name == 'src':
self.scripts.append(value)
return
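
A quick sketch of what the parser collects; the HTML snippet is invented:

from webtech.parser import WTParser

p = WTParser()
p.feed('<meta name="generator" content="WordPress 5.1">'
       '<script src="/wp-includes/js/jquery/jquery.js"></script>')
print(p.meta)     # {'generator': 'WordPress 5.1'}
print(p.scripts)  # ['/wp-includes/js/jquery/jquery.js']
p.close()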

371 webtech/target.py Normal file

@@ -0,0 +1,371 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import json
import re
from io import open
# From now on, hacky hack to work on Burp Jython2.7 without external modules
BURP = False
try:
from requests import get
from requests.utils import dict_from_cookiejar
from requests.structures import CaseInsensitiveDict
from requests.exceptions import RequestException
# Disable warning about Insecure SSL
from requests.packages.urllib3 import disable_warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning
disable_warnings(InsecureRequestWarning)
except ImportError as e:
BURP = True
pass
from . import encoder
from .utils import ConnectionException, FileNotFoundException, Format, Tech, caseinsensitive_in, dict_from_caseinsensitivedict
from .parser import WTParser
# Hacky hack to hack ack. Support python2 and python3 without depending on six
if sys.version_info[0] > 2:
unicode = str
def parse_regex_string(string):
"""
Parse a header string according to the Wappalyzer DB format
strings follow the below format:
<string>[\\;version:\\\d][\\;confidence:\d]
"string" is a mandatory regex string followed by 0 or more parameters (key:value), can be empty
parameters are divided by a \\; sequence (a backslash followed by a semicolon)
examples of parameters are:
"version": indicates which regex group stores the version information
"confidence": indicates a confidence rate
"""
parts = string.split(r"\;")
if len(parts) == 1:
return parts[0], None
else:
extra = {}
for p in parts[1:]:
p = p.split(":")
extra[p[0]] = p[1]
return parts[0], extra
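# Example: the "Server" rule for IIS in webtech/webtech.json below is written as
# "^(?:Microsoft-)?IIS(?:/([\\d.]+))?\\;version:\\1"; after JSON decoding it splits on the
# literal \; into the bare regex plus the extra dict {'version': '\\1'}, meaning that
# regex group 1 carries the detected version.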
class Target():
"""
This class represents a single Target (from scraping a page, from a response file, from a replayed request or from a JSON request-response exchange)
The only self attributes MUST be self.data, which contains the fetched data, and self.report, which contains the results of the various checks
Every function MUST do only one action, since we need to parallelize this and all the data must be source-independent
"""
def __init__(self):
# self.data contains the data fetched from the request
# this object SHOULD be append-only and immutable after the scraping/whitelist process
self.data = {
'url': None,
'html': None,
'headers': {},
'cookies': {},
'meta': {},
'script': {}
}
# self.report contains the information about the technologies detected
self.report = {
'tech': set(),
'headers': [],
}
def scrape_url(self, url, headers={}, cookies={}, timeout=10):
"""
Scrape the target URL and collect all the data that will be filtered afterwards
"""
if BURP:
# Burp flag is set when requests is not installed.
# When using Burp we shouldn't end up in this function so we are in a Python CLI env without requests
raise ImportError("Missing Requests module")
# By default we don't verify SSL certificates, we are only performing some useless GETs
try:
response = get(url, headers=headers, cookies=cookies, verify=False, allow_redirects=True, timeout=timeout)
except RequestException as e:
raise ConnectionException(e)
# print("status: {}".format(response.status_code))
# TODO: switch-case for various response.status_code
self.data['url'] = url
self.data['html'] = response.text
self.data['headers'] = dict_from_caseinsensitivedict(response.headers)
self.data['cookies'] = dict_from_cookiejar(response.cookies)
self.parse_html_page()
def parse_http_file(self, url):
"""
Receive an HTTP request/response file and dispatch it to request or response parsing
"""
path = url.replace('file://', '')
data = open(path, encoding="ISO-8859-1").read()
# e.g. HTTP/1.1 200 OK -> that's a response!
# does not check HTTP/1 since it might be HTTP/2 :)
if data.startswith("HTTP/"):
# BUG: path is not a reliable information. url matching will always fail
self.data['url'] = path
return self.parse_http_response(data)
return self.parse_http_request(data)
def parse_http_response(self, response):
"""
Parse an HTTP response file and collect all the data that will be filtered afterwards
TODO: find a better way to do this :(
"""
response = response.replace('\r', '')
headers_raw, self.data['html'] = response.split('\n\n', 1)
self.data['cookies'] = {}
for header in headers_raw.split('\n'):
header = [x.strip() for x in header.split(":", 1)]
# might be first row: HTTP/1.1 200
if len(header) != 2:
continue
if "set-cookie" in header[0].lower():
# 'Set-Cookie: dr=gonzo; path=/trmon'
cookie = [x.strip() for x in header[1].split(";", 1)[0].split("=", 1)]
# BUG: if there are cookies for different domains with the same name
# they are going to be overwritten (last occurrence will last)...
# ¯\_(ツ)_/¯
self.data['cookies'][cookie[0]] = cookie[1]
else:
self.data['headers'][header[0].lower()] = (header[1], header[0])
self.parse_html_page()
def parse_http_request(self, request, replay=True):
"""
Parse an HTTP request file and collect all the headers
TODO: find a better way to do this :(
TODO: should we support POST request?
"""
# GET / HTTP/1.1 -> /
request = request.replace('\r', '')
replay_uri = request.split('\n', 1)[0].split(" ")[1]
replay_headers = {}
replay_cookies = {}
headers_raw = request.split('\n\n', 1)[0]
for header in headers_raw.split('\n'):
header = [x.strip() for x in header.split(":", 1)]
# might be first row: GET / HTTP/1.1
if len(header) != 2:
continue
if "cookie" not in header[0].lower():
if "host" in header[0].lower():
host = header[1]
else:
replay_headers[header[0]] = header[1]
else:
# 'Cookie: dr=gonzo; mamm=ta; trmo=n'
for cookie in header[1].split(';'):
cookie = [x.strip() for x in cookie.split("=", 1)]
# BUG: if there are cookies for different domains with the same name
# they are going to be overwritten (last occurrence will last)...
# ¯\_(ツ)_/¯
replay_cookies[cookie[0]] = cookie[1]
# BUG: we don't know for sure if it's through HTTP or HTTPS
replay_url = "https://" + host + replay_uri
if replay:
self.scrape_url(replay_url, headers=replay_headers, cookies=replay_cookies)
else:
# The URL is the only useful information when parsing a request without replaying it
self.data['url'] = replay_url
def parse_html_page(self):
"""
Parse HTML content to get meta tag and script-src
"""
p = WTParser()
p.feed(self.data['html'])
self.data['meta'] = p.meta
self.data['script'] = p.scripts
p.close()
def whitelist_data(self, common_headers):
"""
Whitelist collected data to report the important/uncommon data BEFORE matching with the database
This function is useful for CMS/technologies that are not in the database
"""
for key, value in self.data['headers'].items():
if key not in common_headers:
# value[1] stores the original header name
self.report['headers'].append({"name": value[1], "value": value[0]})
def check_html(self, tech, html):
"""
Check if request html contains some database matches
"""
if isinstance(html, str) or isinstance(html, unicode):
html = [html]
for source in html:
matches = re.search(source, self.data['html'], re.IGNORECASE)
if matches is not None:
matched_tech = Tech(name=tech, version=None)
self.report['tech'].add(matched_tech)
# this tech is matched, GOTO next
return
def check_headers(self, tech, headers):
"""
Check if request headers match some database headers
"""
if not isinstance(headers, dict):
raise ValueError('Invalid headers data in database: {}'.format(headers))
# For every tech header check if there is a match in our target
for header in headers:
content = self.data['headers'].get(header.lower())
if content is None:
# Tech not found
return
else:
# Get the real content
content = content[0]
# Parse the matching regex
attr, extra = parse_regex_string(headers[header])
matches = re.search(attr, content, re.IGNORECASE)
# Attr is empty for a "generic" tech header
if attr == '' or matches is not None:
matched_tech = Tech(name=tech, version=None)
# The version extra data is present
if extra and extra['version']:
if matches.group(1):
matched_tech = matched_tech._replace(version=matches.group(1))
self.report['tech'].add(matched_tech)
# remove ALL the tech headers from the Custom Header list
# first make a list of tech headers
tech_headers = list(map(str, headers.keys()))
# then filter them in target headers case insensitively
self.report['headers'] = list(filter(lambda h: not caseinsensitive_in(str(h['name']), tech_headers), self.report['headers']))
# this tech is matched, GOTO next
return
def check_meta(self, tech, meta):
"""
Check if request meta from page's HTML contains some database matches
"""
for m in meta:
content = self.data['meta'].get(m)
# filter not-available meta
if content is None:
continue
attr, extra = parse_regex_string(meta[m])
matches = re.search(attr, content, re.IGNORECASE)
# Attr is empty for a "generic" tech meta
if attr == '' or matches is not None:
matched_tech = Tech(name=tech, version=None)
# The version extra data is present
if extra and extra['version']:
if matches.group(1):
matched_tech = matched_tech._replace(version=matches.group(1))
self.report['tech'].add(matched_tech)
# this tech is matched, GOTO next
return
def check_script(self, tech, script):
"""
Check if request script src from page's HTML contains some database matches
"""
# FIX repair to some database inconsistencies
if isinstance(script, str) or isinstance(script, unicode):
script = [script]
for source in script:
attr, extra = parse_regex_string(source)
for src in self.data['script']:
matches = re.search(attr, src, re.IGNORECASE)
# Attr is empty for a "generic" tech meta
if attr == '' or matches is not None:
matched_tech = Tech(name=tech, version=None)
# The version extra data is present
if extra and extra['version']:
if matches.group(1):
matched_tech = matched_tech._replace(version=matches.group(1))
self.report['tech'].add(matched_tech)
# this tech is matched, GOTO next
return
def check_cookies(self, tech, cookies):
"""
Check if request cookies match some database cookies
"""
for cookie in cookies:
# cookies in db are regexes so we must test them all
cookie = cookie.replace("*","") # FIX for "Fe26.2**" hapi.js cookie in the database
for biscuit in self.data['cookies'].keys():
matches = re.search(cookie, biscuit, re.IGNORECASE)
if matches is not None:
if cookies[cookie] != '':
# Let's check the cookie content
content = self.data['cookies'][biscuit]
matches = re.search(cookies[cookie], content, re.IGNORECASE)
if matches is None:
# No match, exit
return
matched_tech = Tech(name=tech, version=None)
self.report['tech'].add(matched_tech)
# this tech is matched, GOTO next
return
def check_url(self, tech, url):
"""
Check if the request url matches some database url rules
"""
if isinstance(url, str) or isinstance(url, unicode):
url = [url]
for source in url:
matches = re.search(source, self.data['url'], re.IGNORECASE)
if matches is not None:
matched_tech = Tech(name=tech, version=None)
self.report['tech'].add(matched_tech)
# this tech is matched, GOTO next
return
def generate_report(self, output_format):
"""
Generate a report
"""
if output_format == Format['grep']:
techs = ""
for tech in self.report['tech']:
if len(techs): techs += "//"
techs += "{}/{}".format(tech.name, 'unknown' if tech.version is None else tech.version)
headers = ""
for header in self.report['headers']:
if len(headers): headers += "//"
headers += "{}:{}".format(header["name"], header["value"])
return "Url>{}\tTechs>{}\tHeaders>{}".format(self.data['url'], techs, headers)
elif output_format == Format['json']:
return json.loads(json.dumps(self.report, cls=encoder.Encoder))
else:
retval = ""
retval += "Target URL: {}\n".format(self.data['url'])
if self.report['tech']:
retval += "Detected technologies:\n"
for tech in self.report['tech']:
retval += "\t- {} {}\n".format(tech.name, '' if tech.version is None else tech.version)
if self.report['headers']:
retval += "Detected the following interesting custom headers:\n"
for header in self.report['headers']:
retval += "\t- {}: {}\n".format(header["name"], header["value"])
return retval
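
To make the output flavors concrete, a small sketch that fills a Target by hand (URL, tech and header values are invented) and prints the grep and text reports:

from webtech.target import Target
from webtech.utils import Tech, Format

t = Target()
t.data['url'] = 'https://192.0.2.10'  # placeholder target
t.report['tech'].add(Tech(name='IIS', version='8.5'))
t.report['headers'].append({'name': 'X-Powered-By', 'value': 'ASP.NET'})
print(t.generate_report(Format['grep']))
# -> Url>https://192.0.2.10	Techs>IIS/8.5	Headers>X-Powered-By:ASP.NET
print(t.generate_report(Format['text']))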

4195 webtech/ua.txt Normal file

File diff suppressed because it is too large.

37 webtech/utils.py Normal file

@@ -0,0 +1,37 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from collections import namedtuple
try:
FileNotFoundException = FileNotFoundError
except NameError:
FileNotFoundException = IOError
Format = {
'text': 0,
'grep': 1,
'json': 2
}
Tech = namedtuple('Tech', ['name', 'version'])
class ConnectionException(Exception):
pass
class UpdateInBurpException(Exception):
pass
def caseinsensitive_in(element, elist):
"""
Given a list and an element, return true if the element is present in the list
in a case-insensitive flavor
"""
return element.lower() in map(str.lower, elist)
def dict_from_caseinsensitivedict(cidict):
# This is pretty bad, but in Python2 we don't have CaseInsensitiveDict and with Burp we cannot use requests's implementation
d = {}
for key, value in cidict.items():
d[key.lower()] = (value, key)
return d

36 webtech/webtech.json Normal file

@@ -0,0 +1,36 @@
{
"apps": {
"Wix": {
"cookies": {
"svSession": ""
}
},
"Google QUIC": {
"headers": {
"Alt-Svc": "quic"
}
},
"IIS": {
"headers": {
"Server": "^(?:Microsoft-)?IIS(?:/([\\d.]+))?\\;version:\\1"
}
},
"BigIP - F5": {
"headers": {
"Server": "BigIP"
}
},
"Outlook Web Access": {
"headers": {
"X-OWA-Version": "([\\d.]+)\\;version:\\1",
"X-OWA-DiagnosticsInfo": "",
"X-OWA-MinimumSupportedOWSVersion": "",
"X-OWA-OWSVersion": ""
},
"script": [
".*/([\\d.]+)/scripts/microsoft.owa\\S*.js\\;version:\\1",
".*/([\\d.]+)/scripts/owa.mail.js\\;version:\\1"
]
}
}
}

232 webtech/webtech.py Executable file

@@ -0,0 +1,232 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import json
import random
try:
from urlparse import urlparse
except ImportError: # For Python 3
from urllib.parse import urlparse
from . import database
from .utils import Format, FileNotFoundException, ConnectionException
from .target import Target, BURP
from .__version__ import __version__ as VERSION
def default_user_agent():
return "webtech/{}".format(VERSION)
def get_random_user_agent():
"""
Get a random user agent from a file
"""
ua_file = os.path.join(os.path.realpath(os.path.dirname(__file__)), "ua.txt")
try:
with open(ua_file) as f:
agents = f.readlines()
return random.choice(agents).strip()
except FileNotFoundException as e:
print(e)
print('Please: Reinstall webtech correctly or provide a valid User-Agent list')
exit(-1)
class WebTech():
"""
Main class. The orchestrator that decides what to do.
This class is the bridge between the tech's database and the Targets' data
"""
COMMON_HEADERS = ['Accept-Ranges', 'Access-Control-Allow-Methods', 'Access-Control-Allow-Origin', 'Age', 'Cache-Control', 'Connection',
'Content-Encoding', 'Content-Language', 'Content-Length', 'Content-Security-Policy', 'Content-Type', 'Date', 'ETag', 'Expect-CT', 'Expires',
'Feature-Policy', 'Keep-Alive', 'Last-Modified', 'Link', 'Location', 'P3P', 'Pragma', 'Referrer-Policy', 'Set-Cookie',
'Strict-Transport-Security', 'Transfer-Encoding', 'Vary', 'X-Accel-Buffering', 'X-Cache', 'X-Cache-Hits', 'X-Content-Security-Policy',
'X-Content-Type-Options', 'X-Frame-Options', 'X-Timer', 'X-WebKit-CSP', 'X-XSS-Protection']
COMMON_HEADERS = [ch.lower() for ch in COMMON_HEADERS]
# 'cats' tech categories
# 'implies' website is also using this tech
# 'excludes' exclude this tech
# 'website' website for this tech
# 'icon' icon for this tech (useless)
# 'headers' check this pattern in headers
# 'html' check this regex in html
# 'meta' check this pattern in meta
# 'js' check this expression in javascript context
# 'cookies' check this pattern in cookies
# 'script' check this pattern in scripts src
# 'url' check this pattern in url
def __init__(self, options=None):
update = False if options is None else options.get('update_db', False)
success = database.update_database(force=update, burp=BURP)
self.fail = False
if not success:
# Hack for not crashing Burp
self.fail = True
return
with open(database.WAPPALYZER_DATABASE_FILE) as f:
self.db = json.load(f)
with open(database.DATABASE_FILE) as f:
self.db = database.merge_databases(self.db, json.load(f))
# Output text only
self.output_format = Format['text']
# Default user agent
self.USER_AGENT = default_user_agent()
if options is None:
return
if options.get('database_file'):
try:
with open(options.get('database_file')) as f:
self.db = database.merge_databases(self.db, json.load(f))
except (FileNotFoundException, ValueError) as e:
print(e)
exit(-1)
self.urls = options.get('urls', [])
if options.get('urls_file'):
try:
with open(options.get('urls_file')) as f:
self.urls = f.readlines()
except FileNotFoundException as e:
print(e)
exit(-1)
if options.get('user_agent'):
self.USER_AGENT = options.get('user_agent')
elif options.get('random_user_agent'):
self.USER_AGENT = get_random_user_agent()
if options.get('grep'):
# Greppable output
self.output_format = Format['grep']
elif options.get('json'):
# JSON output
self.output_format = Format['json']
try:
self.timeout = int(options.get('timeout', '10'))
except ValueError:
self.timeout = 10
def start(self):
"""
Start the engine, fetch an URL and report the findings
"""
if self.fail:
# Fail badly
exit(1)
self.output = {}
for url in self.urls:
try:
temp_output = self.start_from_url(url)
except (FileNotFoundException, ValueError) as e:
print(e)
continue
except ConnectionException as e:
print("Connection error while scanning {}".format(url))
continue
if self.output_format == Format['text']:
print(temp_output)
else:
self.output[url] = temp_output
if self.output_format == Format['json']:
print(self.output)
else:
for o in self.output.values():
print(o)
def start_from_url(self, url, headers={}, timeout=None):
"""
Start webtech on a single URL/target
Returns the report for that specific target
"""
timeout = timeout or self.timeout
target = Target()
parsed_url = urlparse(url)
if "http" in parsed_url.scheme:
# Scrape the URL by making a request
h = {'User-Agent': self.USER_AGENT}
h.update(headers)
target.scrape_url(url, headers=h, cookies={}, timeout=timeout)
elif "file" in parsed_url.scheme:
# Load the file and read it
target.parse_http_file(url)
else:
raise ValueError("Invalid scheme {} for URL {}. Only 'http', 'https' and 'file' are supported".format(parsed_url.scheme, url))
return self.perform(target)
def start_from_json(self, exchange):
"""
Start webtech on a single target from a HTTP request-response exchange as JSON serialized string
This function is the entry point for the Burp extension
"""
return self.start_from_exchange(json.loads(exchange))
def start_from_exchange(self, exchange):
"""
Start webtech on a single target from a HTTP request-response exchange as Object
"""
target = Target()
target.parse_http_response(exchange['response'])
target.parse_http_request(exchange['request'], replay=False)
return self.perform(target)
def perform(self, target):
"""
Performs all the checks on the current target received as argument
This function can be executed on multiple threads since "it doesn't access on shared data"
"""
if self.fail:
# Fail gracefully
if self.output_format == Format['json']:
return {}
else:
return ''
target.whitelist_data(self.COMMON_HEADERS)
# Cycle through all the db technologies and do all the checks
# It's more efficient to cycle through all technologies and match against the target once per tech
# instead of cycling each target feature against every technology
for tech in self.db["apps"]:
t = self.db["apps"][tech]
headers = t.get("headers")
html = t.get("html")
meta = t.get("meta")
cookies = t.get("cookies")
script = t.get("script")
url = t.get("url")
if headers:
target.check_headers(tech, headers)
if html:
target.check_html(tech, html)
if meta:
target.check_meta(tech, meta)
if cookies:
target.check_cookies(tech, cookies)
if script:
target.check_script(tech, script)
if url:
target.check_url(tech, url)
return target.generate_report(self.output_format)
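
Finally, a brief usage sketch of the two programmatic entry points: start_from_url is what the scanning script at the top of this commit calls, while start_from_json is the Burp-oriented one. The URL and the request/response exchange below are placeholders:

import json
from webtech import WebTech

wt = WebTech(options={'json': True})
report = wt.start_from_url('https://example.com', headers={'Host': 'example.com'}, timeout=2)
print(report)  # {'tech': [...], 'headers': [...]}

# Burp-style entry point: a JSON-serialized request/response exchange (contents invented).
exchange = {'request': 'GET / HTTP/1.1\nHost: example.com\n\n',
            'response': 'HTTP/1.1 200 OK\nServer: BigIP\n\n<html></html>'}
print(wt.start_from_json(json.dumps(exchange)))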