target.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import json
import re

from io import open

# From now on, hacky hack to work on Burp Jython2.7 without external modules
BURP = False
try:
    from requests import get
    from requests.utils import dict_from_cookiejar
    from requests.structures import CaseInsensitiveDict
    from requests.exceptions import RequestException

    # Disable warning about Insecure SSL
    from requests.packages.urllib3 import disable_warnings
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    disable_warnings(InsecureRequestWarning)
except ImportError as e:
    BURP = True
    pass

from . import encoder
from .utils import ConnectionException, FileNotFoundException, Format, Tech, caseinsensitive_in, dict_from_caseinsensitivedict
from .parser import WTParser

# Hacky hack to hack ack. Support python2 and python3 without depending on six
if sys.version_info[0] > 2:
    unicode = str


def parse_regex_string(string):
    """
    Parse a header string according to the Wappalyzer DB format.

    Strings follow the format:

        <string>[\\;version:\\\d][\\;confidence:\d]

    "string" is a mandatory regex string followed by 0 or more parameters (key:value) and can be empty.
    Parameters are separated by a \\; sequence (a backslash followed by a semicolon).

    Examples of parameters are:
    - "version": indicates which regex group stores the version information
    - "confidence": indicates a rate of confidence
    """
    parts = string.split(r"\;")
    if len(parts) == 1:
        return parts[0], None
    else:
        extra = {}
        for p in parts[1:]:
            p = p.split(":")
            extra[p[0]] = p[1]
        return parts[0], extra
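
# A quick illustration of the parsing above, using a made-up database value
# (hypothetical, not copied from the real Wappalyzer database):
#
#   parse_regex_string(r"nginx/?(\d+(\.\d+)*)?\;version:\1\;confidence:50")
#   # -> ("nginx/?(\d+(\.\d+)*)?", {"version": "\\1", "confidence": "50"})
#
# A bare string with no \; parameters simply returns (string, None).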


class Target():
    """
    This class represents a single Target (from scraping a page, from a response file, from a replayed request or from a JSON request-response exchange)

    The only self attributes MUST be self.data, which contains the fetched data, and self.report, which contains the results from the various checks.

    Every function MUST do only 1 action since we need to parallelize this and all the data must be source-independent
    """
    def __init__(self):
        # self.data contains the data fetched from the request
        # this object SHOULD be append-only and immutable after the scraping/whitelist process
        self.data = {
            'url': None,
            'html': None,
            'headers': {},
            'cookies': {},
            'meta': {},
            'script': {}
        }

        # self.report contains the information about the technologies detected
        self.report = {
            'tech': set(),
            'headers': [],
        }

    def scrape_url(self, url, headers={}, cookies={}, timeout=10):
        """
        Scrape the target URL and collect all the data that will be filtered afterwards
        """
        if BURP:
            # The BURP flag is set when requests is not installed.
            # When using Burp we shouldn't end up in this function, so we are in a Python CLI env without requests
            raise ImportError("Missing Requests module")

        # By default we don't verify SSL certificates, we are only performing some useless GETs
        try:
            response = get(url, headers=headers, cookies=cookies, verify=False, allow_redirects=True, timeout=timeout)
        except RequestException as e:
            raise ConnectionException(e)
        # print("status: {}".format(response.status_code))

        # TODO: switch-case for various response.status_code

        self.data['url'] = url
        self.data['html'] = response.text
        self.data['headers'] = dict_from_caseinsensitivedict(response.headers)
        self.data['cookies'] = dict_from_cookiejar(response.cookies)

        self.parse_html_page()

    def parse_http_file(self, url):
        """
        Receives an HTTP request/response file and redirects it to request/response parsing
        """
        path = url.replace('file://', '')
        with open(path, encoding="ISO-8859-1") as f:
            data = f.read()

        # e.g. HTTP/1.1 200 OK -> that's a response!
        # does not check HTTP/1 since it might be HTTP/2 :)
        if data.startswith("HTTP/"):
            # BUG: path is not a reliable information. url matching will always fail
            self.data['url'] = path
            return self.parse_http_response(data)
        return self.parse_http_request(data)

    def parse_http_response(self, response):
        """
        Parse an HTTP response file and collect all the data that will be filtered afterwards

        TODO: find a better way to do this :(
        """
        response = response.replace('\r', '')
        headers_raw, self.data['html'] = response.split('\n\n', 1)
        self.data['cookies'] = {}
        for header in headers_raw.split('\n'):
            header = [x.strip() for x in header.split(":", 1)]
            # might be the first row: HTTP/1.1 200
            if len(header) != 2:
                continue
            if "set-cookie" in header[0].lower():
                # 'Set-Cookie: dr=gonzo; path=/trmon'
                cookie = [x.strip() for x in header[1].split(";", 1)[0].split("=", 1)]
                # BUG: if there are cookies for different domains with the same name
                # they are going to be overwritten (last occurrence will last)...
                # ¯\_(ツ)_/¯
                self.data['cookies'][cookie[0]] = cookie[1]
            else:
                self.data['headers'][header[0].lower()] = (header[1], header[0])

        self.parse_html_page()
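
    # A minimal (made-up) response file that parse_http_response() can digest:
    #
    #   HTTP/1.1 200 OK
    #   Server: nginx
    #   Set-Cookie: dr=gonzo; path=/trmon
    #
    #   <html>...</html>
    #
    # Everything up to the first blank line is parsed as headers (with Set-Cookie
    # handled separately); the rest is stored verbatim as the HTML body.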

    def parse_http_request(self, request, replay=True):
        """
        Parse an HTTP request file and collect all the headers

        TODO: find a better way to do this :(
        TODO: should we support POST requests?
        """
        # GET / HTTP/1.1 -> /
        request = request.replace('\r', '')
        replay_uri = request.split('\n', 1)[0].split(" ")[1]
        replay_headers = {}
        replay_cookies = {}

        headers_raw = request.split('\n\n', 1)[0]
        for header in headers_raw.split('\n'):
            header = [x.strip() for x in header.split(":", 1)]
            # might be the first row: GET / HTTP/1.1
            if len(header) != 2:
                continue
            if "cookie" not in header[0].lower():
                if "host" in header[0].lower():
                    host = header[1]
                else:
                    replay_headers[header[0]] = header[1]
            else:
                # 'Cookie: dr=gonzo; mamm=ta; trmo=n'
                for cookie in header[1].split(';'):
                    cookie = [x.strip() for x in cookie.split("=", 1)]
                    # BUG: if there are cookies for different domains with the same name
                    # they are going to be overwritten (last occurrence will last)...
                    # ¯\_(ツ)_/¯
                    replay_cookies[cookie[0]] = cookie[1]

        # BUG: we don't know for sure if it's through HTTP or HTTPS
        replay_url = "https://" + host + replay_uri

        if replay:
            self.scrape_url(replay_url, headers=replay_headers, cookies=replay_cookies)
        else:
            # The URL is the only useful information when parsing a request without replaying it
            self.data['url'] = replay_url

    def parse_html_page(self):
        """
        Parse HTML content to get meta tag and script-src
        """
        p = WTParser()
        p.feed(self.data['html'])
        self.data['meta'] = p.meta
        self.data['script'] = p.scripts
        p.close()

    def whitelist_data(self, common_headers):
        """
        Whitelist collected data to report the important/uncommon data BEFORE matching with the database

        This function is useful for CMS/technologies that are not in the database
        """
        for key, value in self.data['headers'].items():
            if key not in common_headers:
                # value[1] stores the original header name
                self.report['headers'].append({"name": value[1], "value": value[0]})

    def check_html(self, tech, html):
        """
        Check if request html contains some database matches
        """
        if isinstance(html, (str, unicode)):
            html = [html]

        for source in html:
            matches = re.search(source, self.data['html'], re.IGNORECASE)
            if matches is not None:
                matched_tech = Tech(name=tech, version=None)
                self.report['tech'].add(matched_tech)
                # this tech is matched, GOTO next
                return

    def check_headers(self, tech, headers):
        """
        Check if request headers match some database headers
        """
        if not isinstance(headers, dict):
            raise ValueError('Invalid headers data in database: {}'.format(headers))

        # For every tech header check if there is a match in our target
        for header in headers:
            content = self.data['headers'].get(header.lower())
            if content is None:
                # Tech not found
                return
            else:
                # Get the real content
                content = content[0]

            # Parse the matching regex
            attr, extra = parse_regex_string(headers[header])
            matches = re.search(attr, content, re.IGNORECASE)
            # Attr is empty for a "generic" tech header
            if attr == '' or matches is not None:
                matched_tech = Tech(name=tech, version=None)
                # The version extra data is present (use .get() to avoid a KeyError
                # when the database entry only carries e.g. a confidence parameter)
                if extra and extra.get('version'):
                    if matches.group(1):
                        matched_tech = matched_tech._replace(version=matches.group(1))
                self.report['tech'].add(matched_tech)
                # remove ALL the tech headers from the Custom Header list
                # first make a list of tech headers
                tech_headers = list(map(str, headers.keys()))
                # then filter them out of the target headers, case insensitively
                self.report['headers'] = list(filter(lambda h: not caseinsensitive_in(str(h['name']), tech_headers), self.report['headers']))
                # this tech is matched, GOTO next
                return
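
    # Shape of the `headers` argument above, with a made-up entry (hypothetical,
    # not copied from the real database): {"Server": "nginx(?:/([\d.]+))?\;version:\1"}
    # i.e. header name -> regex string in the parse_regex_string() format, where an
    # empty regex means the mere presence of the header identifies the tech.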

    def check_meta(self, tech, meta):
        """
        Check if request meta from page's HTML contains some database matches
        """
        for m in meta:
            content = self.data['meta'].get(m)
            # filter not-available meta
            if content is None:
                continue

            attr, extra = parse_regex_string(meta[m])
            matches = re.search(attr, content, re.IGNORECASE)
            # Attr is empty for a "generic" tech meta
            if attr == '' or matches is not None:
                matched_tech = Tech(name=tech, version=None)
                # The version extra data is present
                if extra and extra.get('version'):
                    if matches.group(1):
                        matched_tech = matched_tech._replace(version=matches.group(1))
                self.report['tech'].add(matched_tech)
                # this tech is matched, GOTO next
                return

    def check_script(self, tech, script):
        """
        Check if script src from the page's HTML contains some database matches
        """
        # FIX for some database inconsistencies
        if isinstance(script, (str, unicode)):
            script = [script]

        for source in script:
            attr, extra = parse_regex_string(source)
            for src in self.data['script']:
                matches = re.search(attr, src, re.IGNORECASE)
                # Attr is empty for a "generic" tech script
                if attr == '' or matches is not None:
                    matched_tech = Tech(name=tech, version=None)
                    # The version extra data is present
                    if extra and extra.get('version'):
                        if matches.group(1):
                            matched_tech = matched_tech._replace(version=matches.group(1))
                    self.report['tech'].add(matched_tech)
                    # this tech is matched, GOTO next
                    return

    def check_cookies(self, tech, cookies):
        """
        Check if request cookies match some database cookies
        """
        for cookie in cookies:
            # cookies in db are regexes so we must test them all
            # keep the original key around, the content lookup below needs it
            cookie_regex = cookie.replace("*", "")  # FIX for "Fe26.2**" hapi.js cookie in the database
            for biscuit in self.data['cookies'].keys():
                matches = re.search(cookie_regex, biscuit, re.IGNORECASE)
                if matches is not None:
                    if cookies[cookie] != '':
                        # Let's check the cookie content
                        content = self.data['cookies'][biscuit]
                        matches = re.search(cookies[cookie], content, re.IGNORECASE)
                        if matches is None:
                            # No match, exit
                            return
                    matched_tech = Tech(name=tech, version=None)
                    self.report['tech'].add(matched_tech)
                    # this tech is matched, GOTO next
                    return

    def check_url(self, tech, url):
        """
        Check if the request URL matches some database url rules
        """
        if isinstance(url, (str, unicode)):
            url = [url]

        for source in url:
            matches = re.search(source, self.data['url'], re.IGNORECASE)
            if matches is not None:
                matched_tech = Tech(name=tech, version=None)
                self.report['tech'].add(matched_tech)
                # this tech is matched, GOTO next
                return

    def generate_report(self, output_format):
        """
        Generate a report
        """
        if output_format == Format['grep']:
            techs = ""
            for tech in self.report['tech']:
                if len(techs):
                    techs += "//"
                techs += "{}/{}".format(tech.name, 'unknown' if tech.version is None else tech.version)

            headers = ""
            for header in self.report['headers']:
                if len(headers):
                    headers += "//"
                headers += "{}:{}".format(header["name"], header["value"])

            return "Url>{}\tTechs>{}\tHeaders>{}".format(self.data['url'], techs, headers)
        elif output_format == Format['json']:
            return json.loads(json.dumps(self.report, cls=encoder.Encoder))
        else:
            retval = ""
            retval += "Target URL: {}\n".format(self.data['url'])
            if self.report['tech']:
                retval += "Detected technologies:\n"
                for tech in self.report['tech']:
                    retval += "\t- {} {}\n".format(tech.name, '' if tech.version is None else tech.version)
            if self.report['headers']:
                retval += "Detected the following interesting custom headers:\n"
                for header in self.report['headers']:
                    retval += "\t- {}: {}\n".format(header["name"], header["value"])
            return retval
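

# A rough sketch of how calling code might drive a Target (illustrative only;
# the real call sites live elsewhere in the package, and the common_headers
# list below is made up):
#
#   target = Target()
#   target.scrape_url("https://example.com", timeout=10)
#   target.whitelist_data(common_headers=["date", "content-type", "content-length"])
#   print(target.generate_report(Format['grep']))
#
# The grep format produces a single tab-separated line such as:
#   Url>https://example.com\tTechs>Nginx/unknown\tHeaders>X-Custom:foo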