webtech.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import json
import random

try:
    from urlparse import urlparse
except ImportError:  # For Python 3
    from urllib.parse import urlparse

from . import database
from .utils import Format, FileNotFoundException, ConnectionException
from .target import Target, BURP
from .__version__ import __version__ as VERSION


def default_user_agent():
    return "webtech/{}".format(VERSION)


def get_random_user_agent():
    """
    Get a random user agent from a file
    """
    ua_file = os.path.join(os.path.realpath(os.path.dirname(__file__)), "ua.txt")
    try:
        with open(ua_file) as f:
            agents = f.readlines()
            return random.choice(agents).strip()
    except FileNotFoundException as e:
        print(e)
        print('Please reinstall webtech correctly or provide a valid User-Agent list')
        exit(-1)


class WebTech():
    """
    Main class. The orchestrator that decides what to do.

    This class is the bridge between the tech's database and the Targets' data
    """
    COMMON_HEADERS = ['Accept-Ranges', 'Access-Control-Allow-Methods', 'Access-Control-Allow-Origin', 'Age', 'Cache-Control', 'Connection',
                      'Content-Encoding', 'Content-Language', 'Content-Length', 'Content-Security-Policy', 'Content-Type', 'Date', 'ETag', 'Expect-CT', 'Expires',
                      'Feature-Policy', 'Keep-Alive', 'Last-Modified', 'Link', 'Location', 'P3P', 'Pragma', 'Referrer-Policy', 'Set-Cookie',
                      'Strict-Transport-Security', 'Transfer-Encoding', 'Vary', 'X-Accel-Buffering', 'X-Cache', 'X-Cache-Hits', 'X-Content-Security-Policy',
                      'X-Content-Type-Options', 'X-Frame-Options', 'X-Timer', 'X-WebKit-CSP', 'X-XSS-Protection']
    COMMON_HEADERS = [ch.lower() for ch in COMMON_HEADERS]

    # 'cats'     tech categories
    # 'implies'  the website is also using this other tech
    # 'excludes' exclude this tech
    # 'website'  website for this tech
    # 'icon'     icon for this tech (unused)
    # 'headers'  check this pattern in the headers
    # 'html'     check this regex in the html
    # 'meta'     check this pattern in the meta tags
    # 'js'       check this expression in the javascript context
    # 'cookies'  check this pattern in the cookies
    # 'script'   check this pattern in the scripts src
    # 'url'      check this pattern in the url
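    #
    # An illustrative entry built from the keys above; "ExampleCMS" is made up
    # for documentation purposes and is not an entry of the real database:
    #
    # "ExampleCMS": {
    #     "cats": [1],
    #     "headers": {"X-Powered-By": "ExampleCMS"},
    #     "cookies": {"examplecms_session": ""},
    #     "script": "examplecms.*\\.js",
    #     "implies": "PHP",
    #     "website": "https://example.com"
    # }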
    def __init__(self, options=None):
        update = False if options is None else options.get('update_db', False)
        success = database.update_database(force=update, burp=BURP)
        self.fail = False
        if not success:
            # Hack to avoid crashing Burp
            self.fail = True
            return

        with open(database.WAPPALYZER_DATABASE_FILE) as f:
            self.db = json.load(f)
        with open(database.DATABASE_FILE) as f:
            self.db = database.merge_databases(self.db, json.load(f))

        # Output text only
        self.output_format = Format['text']

        # Default user agent
        self.USER_AGENT = default_user_agent()

        if options is None:
            return

        if options.get('database_file'):
            try:
                with open(options.get('database_file')) as f:
                    self.db = database.merge_databases(self.db, json.load(f))
            except (FileNotFoundException, ValueError) as e:
                print(e)
                exit(-1)

        self.urls = options.get('urls', [])

        if options.get('urls_file'):
            try:
                with open(options.get('urls_file')) as f:
                    self.urls = f.readlines()
            except FileNotFoundException as e:
                print(e)
                exit(-1)

        if options.get('user_agent'):
            self.USER_AGENT = options.get('user_agent')
        elif options.get('random_user_agent'):
            self.USER_AGENT = get_random_user_agent()

        if options.get('grep'):
            # Greppable output
            self.output_format = Format['grep']
        elif options.get('json'):
            # JSON output
            self.output_format = Format['json']

        try:
            self.timeout = int(options.get('timeout', '10'))
        except ValueError:
            self.timeout = 10
    def start(self):
        """
        Start the engine, fetch a URL and report the findings
        """
        if self.fail:
            # Fail badly
            exit(1)

        self.output = {}
        for url in self.urls:
            try:
                temp_output = self.start_from_url(url)
            except (FileNotFoundException, ValueError) as e:
                print(e)
                continue
            except ConnectionException:
                print("Connection error while scanning {}".format(url))
                continue

            if self.output_format == Format['text']:
                print(temp_output)
            else:
                self.output[url] = temp_output

        if self.output_format == Format['json']:
            print(self.output)
        else:
            for o in self.output.values():
                print(o)
    def start_from_url(self, url, headers=None, timeout=None):
        """
        Start webtech on a single URL/target

        Returns the report for that specific target
        """
        timeout = timeout or self.timeout
        target = Target()

        parsed_url = urlparse(url)
        if "http" in parsed_url.scheme:
            # Scrape the URL by making a request
            h = {'User-Agent': self.USER_AGENT}
            h.update(headers or {})
            target.scrape_url(url, headers=h, cookies={}, timeout=timeout)
        elif "file" in parsed_url.scheme:
            # Load the file and read it
            target.parse_http_file(url)
        else:
            raise ValueError("Invalid scheme {} for URL {}. Only 'http', 'https' and 'file' are supported".format(parsed_url.scheme, url))

        return self.perform(target)
    def start_from_json(self, exchange):
        """
        Start webtech on a single target from an HTTP request-response exchange serialized as a JSON string

        This function is the entry point for the Burp extension
        """
        return self.start_from_exchange(json.loads(exchange))
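
    # Sketch of the expected input, for illustration only: start_from_exchange
    # below reads exactly the 'request' and 'response' keys; the raw HTTP
    # payloads shown here are made-up placeholders, and their precise format is
    # whatever Target.parse_http_request/parse_http_response accept:
    #
    # wt.start_from_json(
    #     '{"request": "GET / HTTP/1.1\\r\\nHost: example.com\\r\\n\\r\\n",'
    #     ' "response": "HTTP/1.1 200 OK\\r\\nServer: nginx\\r\\n\\r\\n..."}'
    # )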
    def start_from_exchange(self, exchange):
        """
        Start webtech on a single target from an HTTP request-response exchange object
        """
        target = Target()

        target.parse_http_response(exchange['response'])
        target.parse_http_request(exchange['request'], replay=False)

        return self.perform(target)
    def perform(self, target):
        """
        Perform all the checks on the target received as argument

        This function can be executed on multiple threads since it doesn't access shared data
        """
        if self.fail:
            # Fail gracefully
            if self.output_format == Format['json']:
                return {}
            else:
                return ''

        target.whitelist_data(self.COMMON_HEADERS)

        # Cycle through all the db technologies and do all the checks
        # It's more efficient to cycle through all technologies and match the target once per tech
        # than to cycle each target feature against every technology
        for tech in self.db["apps"]:
            t = self.db["apps"][tech]
            headers = t.get("headers")
            html = t.get("html")
            meta = t.get("meta")
            cookies = t.get("cookies")
            script = t.get("script")
            url = t.get("url")
            if headers:
                target.check_headers(tech, headers)
            if html:
                target.check_html(tech, html)
            if meta:
                target.check_meta(tech, meta)
            if cookies:
                target.check_cookies(tech, cookies)
            if script:
                target.check_script(tech, script)
            if url:
                target.check_url(tech, url)

        return target.generate_report(self.output_format)
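

# A minimal usage sketch, not part of the original module. Because of the
# relative imports above it only runs as part of the package (for instance
# via `python -m webtech.webtech`, assuming that module path); the target URL
# is a placeholder, and every option key used is one __init__ actually reads.
if __name__ == '__main__':
    options = {
        'urls': ['https://example.com'],  # targets to scan
        'json': True,                     # collect reports instead of printing text per URL
        'timeout': '5',                   # parsed with int(); falls back to 10 on bad input
        'random_user_agent': True,        # pick a User-Agent line from ua.txt
    }
    wt = WebTech(options)
    wt.start()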