#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import json
import re
from io import open

# From now on, hacky hack to work on Burp Jython2.7 without external modules
BURP = False
try:
    from requests import get
    from requests.utils import dict_from_cookiejar
    from requests.structures import CaseInsensitiveDict
    from requests.exceptions import RequestException

    # Disable warning about Insecure SSL
    from requests.packages.urllib3 import disable_warnings
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    disable_warnings(InsecureRequestWarning)
except ImportError:
    BURP = True

from . import encoder
from .utils import ConnectionException, FileNotFoundException, Format, Tech, caseinsensitive_in, dict_from_caseinsensitivedict
from .parser import WTParser

# Hacky hack to hack ack. Support python2 and python3 without depending on six
if sys.version_info[0] > 2:
    unicode = str


def parse_regex_string(string):
    """
    Parse a header string according to the Wappalyzer DB format

    Strings follow the format below:
    <string>[\\;version:\\\d][\\;confidence:\d]

    "string" is a mandatory regex string followed by 0 or more parameters (key:value) and can be empty;
    parameters are divided by a \\; sequence (a backslash followed by a semicolon)

    Examples of parameters are:
    "version": indicates which regex group stores the version information
    "confidence": indicates a confidence rating
    """
    parts = string.split(r"\;")
    if len(parts) == 1:
        return parts[0], None
    else:
        extra = {}
        for p in parts[1:]:
            p = p.split(":")
            extra[p[0]] = p[1]
        return parts[0], extra
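
# A minimal sketch of how parse_regex_string behaves; the pattern below is
# illustrative, not taken from the real Wappalyzer database:
#
#   >>> parse_regex_string(r"nginx(?:/([\d.]+))?\;version:\1")
#   ('nginx(?:/([\\d.]+))?', {'version': '\\1'})
#   >>> parse_regex_string("")
#   ('', None)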


class Target():
    """
    This class represents a single target (from scraping a page, from a response file, from a replayed request, or from a JSON request-response exchange)

    The only self attributes MUST be self.data, which contains the fetched data, and self.report, which contains the results from the various checks

    Every function MUST do only one action, since we need to parallelize this and all the data must be source-independent
    """
    def __init__(self):
        # self.data contains the data fetched from the request
        # this object SHOULD be append-only and immutable after the scraping/whitelist process
        self.data = {
            'url': None,
            'html': None,
            'headers': {},
            'cookies': {},
            'meta': {},
            'script': {}
        }

        # self.report contains the information about the technologies detected
        self.report = {
            'tech': set(),
            'headers': [],
        }

    def scrape_url(self, url, headers={}, cookies={}, timeout=10):
        """
        Scrape the target URL and collect all the data that will be filtered afterwards
        """
        if BURP:
            # The Burp flag is set when requests is not installed.
            # When using Burp we shouldn't end up in this function, so we are in a Python CLI env without requests
            raise ImportError("Missing Requests module")
        # By default we don't verify SSL certificates, we are only performing some useless GETs
        try:
            response = get(url, headers=headers, cookies=cookies, verify=False, allow_redirects=True, timeout=timeout)
        except RequestException as e:
            raise ConnectionException(e)
        # print("status: {}".format(response.status_code))

        # TODO: switch-case for various response.status_code

        self.data['url'] = url
        self.data['html'] = response.text
        self.data['headers'] = dict_from_caseinsensitivedict(response.headers)
        self.data['cookies'] = dict_from_cookiejar(response.cookies)
        self.parse_html_page()
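
    # After a successful scrape, self.data has roughly this shape (the values
    # below are illustrative, and the (value, original name) header tuples are
    # assumed from how dict_from_caseinsensitivedict is used elsewhere here):
    #
    #   {
    #       'url': 'https://example.com/',
    #       'html': '<!DOCTYPE html>...',
    #       'headers': {'server': ('nginx', 'Server')},
    #       'cookies': {'session': 'abc123'},
    #       'meta': {'generator': 'WordPress 5.0'},
    #       'script': ['https://example.com/wp-includes/js/jquery.js'],
    #   }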

    def parse_http_file(self, url):
        """
        Receive an HTTP request/response file and redirect to request/response parsing
        """
        path = url.replace('file://', '')
        with open(path, encoding="ISO-8859-1") as f:
            data = f.read()

        # e.g. HTTP/1.1 200 OK -> that's a response!
        # does not check HTTP/1 since it might be HTTP/2 :)
        if data.startswith("HTTP/"):
            # BUG: path is not reliable information. url matching will always fail
            self.data['url'] = path
            return self.parse_http_response(data)
        return self.parse_http_request(data)

    def parse_http_response(self, response):
        """
        Parse an HTTP response file and collect all the data that will be filtered afterwards

        TODO: find a better way to do this :(
        """
        response = response.replace('\r', '')
        headers_raw, self.data['html'] = response.split('\n\n', 1)
        self.data['cookies'] = {}
        for header in headers_raw.split('\n'):
            header = [x.strip() for x in header.split(":", 1)]
            # might be the first row: HTTP/1.1 200
            if len(header) != 2:
                continue
            if "set-cookie" in header[0].lower():
                # 'Set-Cookie: dr=gonzo; path=/trmon'
                cookie = [x.strip() for x in header[1].split(";", 1)[0].split("=", 1)]
                # BUG: if there are cookies for different domains with the same name
                # they are going to be overwritten (the last occurrence wins)...
                # ¯\_(ツ)_/¯
                self.data['cookies'][cookie[0]] = cookie[1]
            else:
                self.data['headers'][header[0].lower()] = (header[1], header[0])

        self.parse_html_page()
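
    # A minimal sketch of the input this parser expects; the response below is
    # hand-written for illustration, not captured from a real server:
    #
    #   HTTP/1.1 200 OK
    #   Server: nginx
    #   Set-Cookie: dr=gonzo; path=/trmon
    #
    #   <html>...</html>
    #
    # which would yield headers {'server': ('nginx', 'Server')} and
    # cookies {'dr': 'gonzo'}.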

    def parse_http_request(self, request, replay=True):
        """
        Parse an HTTP request file and collect all the headers

        TODO: find a better way to do this :(
        TODO: should we support POST requests?
        """
        # GET / HTTP/1.1 -> /
        request = request.replace('\r', '')
        replay_uri = request.split('\n', 1)[0].split(" ")[1]
        replay_headers = {}
        replay_cookies = {}
        host = None

        headers_raw = request.split('\n\n', 1)[0]
        for header in headers_raw.split('\n'):
            header = [x.strip() for x in header.split(":", 1)]
            # might be the first row: GET / HTTP/1.1
            if len(header) != 2:
                continue
            if "cookie" not in header[0].lower():
                if "host" in header[0].lower():
                    host = header[1]
                else:
                    replay_headers[header[0]] = header[1]
            else:
                # 'Cookie: dr=gonzo; mamm=ta; trmo=n'
                for cookie in header[1].split(';'):
                    cookie = [x.strip() for x in cookie.split("=", 1)]
                    # BUG: if there are cookies for different domains with the same name
                    # they are going to be overwritten (the last occurrence wins)...
                    # ¯\_(ツ)_/¯
                    replay_cookies[cookie[0]] = cookie[1]

        if host is None:
            # without a Host header we cannot rebuild the target URL
            raise ValueError("Missing Host header in the request file")
        # BUG: we don't know for sure if it's through HTTP or HTTPS
        replay_url = "https://" + host + replay_uri
        if replay:
            self.scrape_url(replay_url, headers=replay_headers, cookies=replay_cookies)
        else:
            # The URL is the only useful information when parsing a request without replaying it
            self.data['url'] = replay_url
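
    # An illustrative request file that this parser accepts (hypothetical
    # host and path; as the BUG note above says, HTTPS is assumed on replay):
    #
    #   GET /admin HTTP/1.1
    #   Host: example.com
    #   Cookie: dr=gonzo; trmo=n
    #
    # which is replayed as a GET to https://example.com/admin with the
    # cookies {'dr': 'gonzo', 'trmo': 'n'}.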

    def parse_html_page(self):
        """
        Parse the HTML content to get the meta tags and the script sources
        """
        p = WTParser()
        p.feed(self.data['html'])
        self.data['meta'] = p.meta
        self.data['script'] = p.scripts
        p.close()

    def whitelist_data(self, common_headers):
        """
        Whitelist collected data to report the important/uncommon data BEFORE matching with the database

        This function is useful for CMS/technologies that are not in the database
        """
        for key, value in self.data['headers'].items():
            if key not in common_headers:
                # value[1] stores the original header name
                self.report['headers'].append({"name": value[1], "value": value[0]})

    def check_html(self, tech, html):
        """
        Check if the request HTML contains some database matches
        """
        if isinstance(html, str) or isinstance(html, unicode):
            html = [html]

        for source in html:
            matches = re.search(source, self.data['html'], re.IGNORECASE)
            if matches is not None:
                matched_tech = Tech(name=tech, version=None)
                self.report['tech'].add(matched_tech)
                # this tech is matched, GOTO next
                return

    def check_headers(self, tech, headers):
        """
        Check if the request headers match some database headers
        """
        if not isinstance(headers, dict):
            raise ValueError('Invalid headers data in database: {}'.format(headers))

        # For every tech header check if there is a match in our target
        for header in headers:
            content = self.data['headers'].get(header.lower())
            if content is None:
                # Tech not found
                return
            else:
                # Get the real content
                content = content[0]
            # Parse the matching regex
            attr, extra = parse_regex_string(headers[header])
            matches = re.search(attr, content, re.IGNORECASE)
            # Attr is empty for a "generic" tech header
            if attr == '' or matches is not None:
                matched_tech = Tech(name=tech, version=None)
                # The version extra data is present (use .get: extra may only carry "confidence")
                if extra and extra.get('version'):
                    if matches.group(1):
                        matched_tech = matched_tech._replace(version=matches.group(1))
                self.report['tech'].add(matched_tech)
                # remove ALL the tech headers from the Custom Header list
                # first make a list of tech headers
                tech_headers = list(map(str, headers.keys()))
                # then filter them out of the target headers, case insensitively
                self.report['headers'] = list(filter(lambda h: not caseinsensitive_in(str(h['name']), tech_headers), self.report['headers']))
                # this tech is matched, GOTO next
                return
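
    # A minimal sketch of a database entry this check understands (the entry
    # is illustrative, not copied from the real database):
    #
    #   check_headers("nginx", {"Server": r"nginx(?:/([\d.]+))?\;version:\1"})
    #
    # against a target header 'Server: nginx/1.2.3' this reports nginx
    # with version 1.2.3, extracted from the first regex group.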

    def check_meta(self, tech, meta):
        """
        Check if the meta tags from the page's HTML contain some database matches
        """
        for m in meta:
            content = self.data['meta'].get(m)
            # filter not-available meta
            if content is None:
                continue
            attr, extra = parse_regex_string(meta[m])
            matches = re.search(attr, content, re.IGNORECASE)
            # Attr is empty for a "generic" tech meta
            if attr == '' or matches is not None:
                matched_tech = Tech(name=tech, version=None)
                # The version extra data is present (use .get: extra may only carry "confidence")
                if extra and extra.get('version'):
                    if matches.group(1):
                        matched_tech = matched_tech._replace(version=matches.group(1))
                self.report['tech'].add(matched_tech)
                # this tech is matched, GOTO next
                return

    def check_script(self, tech, script):
        """
        Check if the script src from the page's HTML contains some database matches
        """
        # FIX: repair some database inconsistencies
        if isinstance(script, str) or isinstance(script, unicode):
            script = [script]

        for source in script:
            attr, extra = parse_regex_string(source)
            for src in self.data['script']:
                matches = re.search(attr, src, re.IGNORECASE)
                # Attr is empty for a "generic" tech script
                if attr == '' or matches is not None:
                    matched_tech = Tech(name=tech, version=None)
                    # The version extra data is present (use .get: extra may only carry "confidence")
                    if extra and extra.get('version'):
                        if matches.group(1):
                            matched_tech = matched_tech._replace(version=matches.group(1))
                    self.report['tech'].add(matched_tech)
                    # this tech is matched, GOTO next
                    return

    def check_cookies(self, tech, cookies):
        """
        Check if the request cookies match some database cookies
        """
        for cookie in cookies:
            # cookies in the db are regexes, so we must test them all
            cookie = cookie.replace("*", "")  # FIX for the "Fe26.2**" hapi.js cookie in the database
            for biscuit in self.data['cookies'].keys():
                matches = re.search(cookie, biscuit, re.IGNORECASE)
                if matches is not None:
                    if cookies[cookie] != '':
                        # Let's check the cookie content
                        content = self.data['cookies'][biscuit]
                        matches = re.search(cookies[cookie], content, re.IGNORECASE)
                        if matches is None:
                            # No match, exit
                            return
                    matched_tech = Tech(name=tech, version=None)
                    self.report['tech'].add(matched_tech)
                    # this tech is matched, GOTO next
                    return
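
    # An illustrative entry for this check (the names are hypothetical):
    #
    #   check_cookies("hapi", {"session": ""})
    #
    # the database cookie name is matched as a regex against every collected
    # cookie name; a non-empty value would also be matched against the content.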

    def check_url(self, tech, url):
        """
        Check if the request URL matches some database url rules
        """
        if isinstance(url, str) or isinstance(url, unicode):
            url = [url]

        for source in url:
            matches = re.search(source, self.data['url'], re.IGNORECASE)
            if matches is not None:
                matched_tech = Tech(name=tech, version=None)
                self.report['tech'].add(matched_tech)
                # this tech is matched, GOTO next
                return

    def generate_report(self, output_format):
        """
        Generate a report in the requested output format
        """
        if output_format == Format['grep']:
            techs = ""
            for tech in self.report['tech']:
                if techs:
                    techs += "//"
                techs += "{}/{}".format(tech.name, 'unknown' if tech.version is None else tech.version)

            headers = ""
            for header in self.report['headers']:
                if headers:
                    headers += "//"
                headers += "{}:{}".format(header["name"], header["value"])

            return "Url>{}\tTechs>{}\tHeaders>{}".format(self.data['url'], techs, headers)
        elif output_format == Format['json']:
            return json.loads(json.dumps(self.report, cls=encoder.Encoder))
        else:
            retval = ""
            retval += "Target URL: {}\n".format(self.data['url'])
            if self.report['tech']:
                retval += "Detected technologies:\n"
                for tech in self.report['tech']:
                    retval += "\t- {} {}\n".format(tech.name, '' if tech.version is None else tech.version)
            if self.report['headers']:
                retval += "Detected the following interesting custom headers:\n"
                for header in self.report['headers']:
                    retval += "\t- {}: {}\n".format(header["name"], header["value"])
            return retval
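

# A minimal usage sketch (illustrative values; the real CLI drives this class
# through the rest of the webtech package, and Format comes from .utils --
# the 'text' key is assumed to select the human-readable report):
#
#   t = Target()
#   t.scrape_url("https://example.com", timeout=5)
#   t.whitelist_data(common_headers=["server", "date", "content-type"])
#   print(t.generate_report(Format['text']))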