diff --git a/code/url-analyzer.ipynb b/code/url-analyzer.ipynb
new file mode 100644
index 0000000..5cdc6f5
--- /dev/null
+++ b/code/url-analyzer.ipynb
@@ -0,0 +1,4093 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generating statistics: https://hoxhunt.com/\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generating statistics: https://ts.fi\n"
+     ]
+    }
+   ],
+   "source": [
+    "#!/bin/env python\n",
+    "\n",
+    "\"\"\"\n",
+    "URL data extractor\n",
+    "\n",
+    "Pekka Helenius \n",
+    "\n",
+    "Requirements:\n",
+    "\n",
+    "Python 3\n",
+    "Python 3 BeautifulSoup4 (python-beautifulsoup4)\n",
+    "Python 3 whois (python-whois; PyPI)\n",
+    "Python 3 JSON Schema (python-jsonschema)\n",
+    "Python 3 Numpy (python-numpy)\n",
+    "Python 3 matplotlib (python-matplotlib)\n",
+    "\n",
+    "TODO: URL domain part length comparison analysis\n",
+    "TODO: URL non-TLD part length comparison analysis\n",
+    "  - in phishing webpages, URLs tend to be much longer than in legitimate webpages;\n",
+    "    the domains themselves (without the TLD), however, tend to be much shorter\n",
+    "  - phishing URLs often contain more dots and subdomains than legitimate URLs\n",
+    "  - legitimate: robots.txt redirects bots to a legitimate domain rather than to the original phishing domain\n",
+    "\n",
+    "TODO: Website visual similarity analysis\n",
+    "TODO: consistency of RDN usage in HTML data\n",
+    "\"\"\"\n",
+    "\n",
+    "######################################\n",
+    "\n",
+    "%matplotlib notebook\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "from bs4 import BeautifulSoup as bs\n",
+    "from collections import Counter\n",
+    "from datetime import date, datetime\n",
+    "import json\n",
+    "import os\n",
+    "import re\n",
+    "import requests\n",
+    "from time import sleep\n",
+    "import urllib\n",
+    "from whois import whois\n",
+    "\n",
+    "# Target URLs\n",
+    "urls = [\n",
+    "    \"https://hoxhunt.com/\",\n",
+    "    \"https://hs.fi\",\n",
+    "    \"https://ts.fi\",\n",
+    "    \"https://facebook.com\"\n",
+    "]\n",
+    "\n",
+    "# Some web servers may block our request unless we set a widely used, well-known user agent string\n",
+    "request_headers = {\n",
+    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'\n",
+    "}\n",
+    "\n",
+    "# Date format for domain timestamps\n",
+    "dateformat = \"%Y/%m/%d\"\n",
+    "\n",
+    "# Some web servers may not like data being fetched too fast\n",
+    "# Sleep time in seconds\n",
+    "sleep_interval_between_requests = 0.5\n",
+    "\n",
+    "# Write JSON results to a file?\n",
+    "use_file = True\n",
+    "# Full file path + name\n",
+    "filename = os.getcwd() + \"/\" + \"url_info.json\"\n",
+    "\n",
+    "# Generate plot from existing JSON data?\n",
+    "plot_only = False\n",
+    "\n",
+    "# Save generated plot images?\n",
+    "save_plot_images = True\n",
+    "\n",
+    "# DPI of plot images\n",
+    "plot_images_dpi = 150\n",
+    "\n",
+    "# Common link attribute references in various HTML elements\n",
+    "link_refs = {\n",
+    "    'a': 'href',\n",
+    "    'img': 'src',\n",
+    "    'script': 'src'\n",
+    "}\n",
+    "\n",
+    "############################################################################\n",
+    "############################################################################\n",
+    "\n",
+    "class json_url_data(object):\n",
+    "\n",
+    "# def __init__(self):\n",
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    Set a new HTTP session and get response.\n",
+    "\n",
+    "    Returns a requests.models.Response object.\n",
+    "    \"\"\"\n",
+    "    def set_session(self, url, method='get', redirects=True):\n",
+    "\n",
+    "        # HTTP response status codes 1XX, 2XX and 3XX are OK\n",
+    "        # Treat other codes as errors\n",
+    "        sc = re.compile(r\"^[123][0-9]{2}\")\n",
+    "\n",
+    "        sleep(sleep_interval_between_requests)\n",
+    "\n",
+    "        try:\n",
+    "            session = requests.Session()\n",
+    "            response = session.request(method, url, headers=request_headers, allow_redirects=redirects)\n",
+    "\n",
+    "            if not sc.match(str(response.status_code)):\n",
+    "                raise Exception(\"Error: got invalid response status from the web server\")\n",
+    "            return response\n",
+    "\n",
+    "        except:\n",
+    "            raise Exception(\"Error: HTTP session could not be established. URL: '\" + url + \"' (method: \" + method + \")\") from None\n",
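+    "\n",
+    "# Illustrative usage sketch (assumes network access; values vary per site):\n",
+    "#   resp = json_url_data().set_session('https://hoxhunt.com/')\n",
+    "#   resp.status_code  # e.g. 200\n",
+    "#   resp.url          # final URL after any redirects\n",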
URL: '\" + url + \"' (method: \" + method + \")\") from None\n", + "\n", + "######################################\n", + " \"\"\"\n", + " Fetch HTML data.\n", + "\n", + " Returns a bs4.BeautifulSoup object.\n", + " \"\"\"\n", + " def get_html_data(self, url):\n", + " \n", + " try:\n", + " data = bs(self.set_session(url).content, 'html.parser')\n", + " return data\n", + " except:\n", + " raise Exception(\"Error: HTML data could not be retrieved\")\n", + "\n", + "######################################\n", + " \"\"\"\n", + " Get URL redirects and related HTTP status codes.\n", + "\n", + " Returns a list object.\n", + " \"\"\"\n", + " def get_url_redirects(self, url):\n", + " \n", + " response = self.set_session(url)\n", + " list_data = []\n", + " \n", + " if response.history:\n", + " \n", + " for r in response.history:\n", + " list_data.append({'redirect_url': r.url, 'status': r.status_code})\n", + " \n", + " return list_data\n", + "\n", + "######################################\n", + " \"\"\"\n", + " Extract title HTML element contents from given HTML data.\n", + "\n", + " Returns a string object.\n", + " \"\"\"\n", + " def get_webpage_title(self, url):\n", + " \n", + " html_data = self.get_html_data(url)\n", + " \n", + " title = html_data.title.string\n", + " return title\n", + "\n", + "######################################\n", + " \"\"\"\n", + " Get WHOIS domain data.\n", + "\n", + " Returns a dict object.\n", + " \"\"\"\n", + " def get_whois_data(self, url):\n", + " dict_data = whois(url)\n", + " return dict_data\n", + "\n", + "######################################\n", + " \"\"\"\n", + " Get domain name based on WHOIS domain data.\n", + " \"\"\"\n", + " def get_domain_name(self, url):\n", + " domain_name = self.get_whois_data(url).domain_name\n", + " \n", + " if type(domain_name) is list:\n", + " return domain_name[0].lower()\n", + " else:\n", + " return domain_name.lower()\n", + "\n", + "######################################\n", + " \"\"\"\n", + " Get initial and final URLs\n", + " \n", + " Compare whether the final (destination) URL\n", + " matches with the initial URL in a request.\n", + " \n", + " Returns a dict object.\n", + " \"\"\"\n", + " def get_startfinal_urls(self, url):\n", + " \n", + " response = self.set_session(url)\n", + " end_url = response.url\n", + " \n", + " start_match = False\n", + " final_match = False\n", + " \n", + " # dr = re.compile(r\"^([a-z]+://)?([^/]+)\")\n", + " # dr_group_lastindex = dr.match(url).lastindex\n", + " # domain_name = dr.match(url).group(dr_group_lastindex)\n", + " \n", + " domain_name = self.get_domain_name(url)\n", + " \n", + " if re.search(domain_name, end_url):\n", + " final_match = True\n", + " \n", + " dict_data = {\n", + " 'startfinal_urls': {\n", + " 'start_url': {\n", + " 'url': url\n", + " },\n", + " 'final_url': {\n", + " 'url': end_url, 'domain_match': final_match\n", + " }\n", + " }\n", + " }\n", + " \n", + " return dict_data\n", + "\n", + "######################################\n", + " \"\"\"\n", + " Get domain registrar\n", + " \n", + " Returns a dict object.\n", + " \"\"\"\n", + " def get_domain_registrar(self, url):\n", + " dict_data = {'domain_registrar': self.get_whois_data(url).registrar }\n", + " return dict_data\n", + "\n", + "######################################\n", + " \"\"\"\n", + " Do comparison between the domain name, extracted\n", + " from WHOIS domain data and contents of a title HTML\n", + " element, extracted from HTML data based on a given URL.\n", + " \n", + " Returns a dict object.\n", + " \"\"\"\n", + " 
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    Get domain registrar.\n",
+    "\n",
+    "    Returns a dict object.\n",
+    "    \"\"\"\n",
+    "    def get_domain_registrar(self, url):\n",
+    "        dict_data = {'domain_registrar': self.get_whois_data(url).registrar }\n",
+    "        return dict_data\n",
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    Compare the domain name extracted from WHOIS domain data\n",
+    "    with the contents of the title HTML element extracted from\n",
+    "    HTML data of the given URL.\n",
+    "\n",
+    "    Returns a dict object.\n",
+    "    \"\"\"\n",
+    "    def get_domain_title_match(self, url):\n",
+    "\n",
+    "        domain_name = self.get_domain_name(url)\n",
+    "        title = self.get_webpage_title(url)\n",
+    "\n",
+    "        # If is string:\n",
+    "        if type(domain_name) is str:\n",
+    "            if re.search(domain_name, title, re.IGNORECASE):\n",
+    "                match = True\n",
+    "            else:\n",
+    "                match = False\n",
+    "\n",
+    "        # If is list:\n",
+    "        elif type(domain_name) is list:\n",
+    "            match = False\n",
+    "            for d in domain_name:\n",
+    "                if re.search(d, title, re.IGNORECASE):\n",
+    "                    match = True\n",
+    "                    break\n",
+    "        else:\n",
+    "            match = False\n",
+    "\n",
+    "        dict_data = {\n",
+    "            'webpage_title': title,\n",
+    "            'domain_in_webpage_title': match\n",
+    "        }\n",
+    "\n",
+    "        return dict_data\n",
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    Get a single timestamp from given data.\n",
+    "\n",
+    "    Two scenarios are considered: the dates argument is either\n",
+    "    a list or a single datetime object. If it is a list, then we\n",
+    "    need to decide which date value to extract.\n",
+    "\n",
+    "    Returns a datetime object.\n",
+    "    \"\"\"\n",
+    "    def get_single_date(self, dates, newest=False):\n",
+    "\n",
+    "        dates_epoch = []\n",
+    "\n",
+    "        if type(dates) is list:\n",
+    "            for d in dates:\n",
+    "                dates_epoch.append(d.timestamp())\n",
+    "        else:\n",
+    "            dates_epoch.append(dates.timestamp())\n",
+    "\n",
+    "        return datetime.fromtimestamp(sorted(dates_epoch, reverse=newest)[0])\n",
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    Get domain time information based on WHOIS domain data.\n",
+    "\n",
+    "    Returns a dict object.\n",
+    "    \"\"\"\n",
+    "    def get_domain_timeinfo(self, url):\n",
+    "\n",
+    "        whois_data = self.get_whois_data(url)\n",
+    "        domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False)\n",
+    "        domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False)\n",
+    "        domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False)\n",
+    "\n",
+    "        dict_data = {\n",
+    "            'domain_timestamps':\n",
+    "            {\n",
+    "                'created': domain_creation_date.strftime(dateformat),\n",
+    "                'updated': domain_updated_date.strftime(dateformat),\n",
+    "                'expires': domain_expiration_date.strftime(dateformat)\n",
+    "            }\n",
+    "        }\n",
+    "\n",
+    "        return dict_data\n",
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    Get domain time information based on WHOIS domain data,\n",
+    "    relative to the current date (UTC time).\n",
+    "\n",
+    "    Returns a dict object.\n",
+    "    \"\"\"\n",
+    "    def get_domain_timeinfo_relative(self, url):\n",
+    "\n",
+    "        date_now = datetime.utcnow()\n",
+    "\n",
+    "        whois_data = self.get_whois_data(url)\n",
+    "        domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False)\n",
+    "        domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False)\n",
+    "        domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False)\n",
+    "\n",
+    "        dict_data = {\n",
+    "            'domain_timestamps_relative':\n",
+    "            {\n",
+    "                'current_date': (date_now.strftime(dateformat)),\n",
+    "                'created_days_ago': (date_now - domain_creation_date).days,\n",
+    "                'updated_days_ago': (date_now - domain_updated_date).days,\n",
+    "                'expires_days_left': (domain_expiration_date - date_now).days\n",
+    "            }\n",
+    "        }\n",
+    "\n",
+    "        return dict_data\n",
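+    "\n",
+    "# Shape of the returned dict (illustrative values):\n",
+    "#   {'domain_timestamps_relative': {'current_date': '2021/03/12',\n",
+    "#    'created_days_ago': 8650, 'updated_days_ago': 120, 'expires_days_left': 240}}\n",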
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    Determine whether URL matches syntaxes such as\n",
+    "    '../foo/bar/'\n",
+    "    '/foo/../../bar/'\n",
+    "    'https://foo.bar/foo/../'\n",
+    "\n",
+    "    etc.\n",
+    "\n",
+    "    Returns a boolean object.\n",
+    "    \"\"\"\n",
+    "    def is_multidot_url(self, url):\n",
+    "\n",
+    "        multidot = re.compile(r\".*[.]{2}/.*\")\n",
+    "\n",
+    "        if multidot.match(url):\n",
+    "            return True\n",
+    "        return False\n",
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    Get HTML element data from HTML data contents.\n",
+    "\n",
+    "    Two fetching methods are supported:\n",
+    "    - A) use only the HTML element/tag name and extract the raw\n",
+    "      contents of these tags\n",
+    "    - B) use both the HTML element/tag name and a more fine-grained\n",
+    "      inner attribute name to determine which HTML elements are extracted\n",
+    "\n",
+    "    Special case - URL link references:\n",
+    "    - attributes 'href' and 'src' are considered link referrals and\n",
+    "      they are handled in a special way\n",
+    "    - A) link referrals pointing directly to the domain are placed in the\n",
+    "      'self_refs' list (patterns: '/', '#' and '../')\n",
+    "    - B) link referrals to external domains are placed in the 'ext_refs'\n",
+    "      list (patterns such as 'https://foo.bar.dot/fancysite' etc.)\n",
+    "\n",
+    "    - Both A) and B) link categories have 'normal' and 'multidot' subcategories\n",
+    "      - normal links do not contain the pattern '../'\n",
+    "      - multidot links contain the '../' pattern\n",
+    "\n",
+    "    Returns a dict object.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def get_tag_data(self, url, tag, attribute=None):\n",
+    "\n",
+    "        html_data = self.get_html_data(url)\n",
+    "        domain_name = self.get_domain_name(url)\n",
+    "        data = []\n",
+    "\n",
+    "        if attribute != None:\n",
+    "\n",
+    "            for d in html_data.find_all(tag):\n",
+    "\n",
+    "                # Ignore the HTML tag if it does not contain our attribute\n",
+    "                if d.get(attribute) != None:\n",
+    "                    data.append(d.get(attribute))\n",
+    "\n",
+    "            if attribute == 'href' or attribute == 'src':\n",
+    "\n",
+    "                self_refs = { 'normal': [], 'multidot': []}\n",
+    "                ext_refs = { 'normal': [], 'multidot': []}\n",
+    "\n",
+    "                # Syntax: '#', '/', '../'\n",
+    "                rs = re.compile(r\"^[/#]|^[.]{2}/.*\")\n",
+    "\n",
+    "                # Syntax: ':/'\n",
+    "                rd = re.compile(r\"^[a-z]+:[a-z]+/\")\n",
+    "\n",
+    "                # Syntax examples:\n",
+    "                # 'http://foo.bar/', 'https://foo.bar/', 'foo.bar/', 'https://virus.foo.bar/'\n",
+    "                rl = re.compile(r\"^([a-z]+://)?([^/]*\" + re.escape(domain_name) + \"/)\")\n",
+    "\n",
+    "                for s in data:\n",
+    "\n",
+    "                    # Ignore mailto links\n",
+    "                    if re.match(\"^mailto:\", s): continue\n",
+    "\n",
+    "                    if rs.match(s) or rl.match(s) or rd.match(s):\n",
+    "                        if self.is_multidot_url(s):\n",
+    "                            self_refs['multidot'].append(s)\n",
+    "                        else:\n",
+    "                            self_refs['normal'].append(s)\n",
+    "                    else:\n",
+    "\n",
+    "                        if self.is_multidot_url(s):\n",
+    "                            try:\n",
+    "                                ext_refs['multidot'].append({'url': s, 'registrar': self.get_whois_data(s).registrar })\n",
+    "                            except:\n",
+    "                                # Fallback if WHOIS query fails\n",
+    "                                ext_refs['multidot'].append({'url': s, 'registrar': None })\n",
+    "                        else:\n",
+    "                            try:\n",
+    "                                ext_refs['normal'].append({'url': s, 'registrar': self.get_whois_data(s).registrar })\n",
+    "                            except:\n",
+    "                                # Fallback if WHOIS query fails\n",
+    "                                ext_refs['normal'].append({'url': s, 'registrar': None })\n",
+    "\n",
+    "                data = None\n",
+    "\n",
+    "                dict_data = {\n",
+    "                    tag: {\n",
+    "                        attribute + '_ext': (ext_refs),\n",
+    "                        attribute + '_self': (self_refs)\n",
+    "                    }\n",
+    "                }\n",
+    "\n",
+    "            else:\n",
+    "                dict_data = {\n",
+    "                    tag: {\n",
+    "                        attribute: (data)\n",
+    "                    }\n",
+    "                }\n",
+    "\n",
+    "        else:\n",
+    "            for d in html_data.find_all(tag):\n",
+    "                data.append(d.prettify())\n",
+    "\n",
+    "            dict_data = {\n",
+    "                tag: (data)\n",
+    "            }\n",
+    "\n",
+    "        return dict_data\n",
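+    "\n",
+    "# Shape of the returned dict for tag='a', attribute='href' (illustrative values):\n",
+    "#   {'a': {'href_ext': {'normal': [{'url': 'https://foo.bar/x', 'registrar': 'Example Registrar'}], 'multidot': []},\n",
+    "#          'href_self': {'normal': ['/about', '#main'], 'multidot': ['../index.html']}}}\n",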
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    How many external URL links have the same registrar as\n",
+    "    the webpage itself?\n",
+    "    \"\"\"\n",
+    "    def get_registrar_count(self, registrar, urls):\n",
+    "\n",
+    "        i = 0\n",
+    "\n",
+    "        for u in urls:\n",
+    "            for k,v in u.items():\n",
+    "                if k == 'registrar' and v == registrar:\n",
+    "                    i += 1\n",
+    "\n",
+    "        o = len(urls) - i\n",
+    "\n",
+    "        dict_data = {\n",
+    "            'same_registrar_count': i,\n",
+    "            'other_registrar_count': o\n",
+    "        }\n",
+    "\n",
+    "        return dict_data\n",
+    "\n",
+    "######################################\n",
+    "\n",
+    "    \"\"\"\n",
+    "    Get values existing in a dict object,\n",
+    "    based on a known key string.\n",
+    "\n",
+    "    Returns a list object.\n",
+    "\n",
+    "    TODO: Major re-work for the fetch function\n",
+    "\n",
+    "    TODO: Support for more sophisticated JSON key string filtering\n",
+    "    (possibility to use multiple keys for filtering)\n",
+    "    \"\"\"\n",
+    "    class json_fetcher(object):\n",
+    "\n",
+    "        def __init__(self, dict_data, json_key):\n",
+    "            self.json_dict = json.loads(json.dumps(dict_data))\n",
+    "            self.json_key = json_key\n",
+    "\n",
+    "        ##########\n",
+    "        # Ref: https://www.codespeedy.com/how-to-loop-through-json-with-subkeys-in-python/\n",
+    "        def fetch(self, jdata):\n",
+    "\n",
+    "            if isinstance(jdata, dict):\n",
+    "\n",
+    "                for k,v in jdata.items():\n",
+    "                    if k == self.json_key:\n",
+    "                        yield v\n",
+    "                    elif isinstance(v, dict):\n",
+    "                        for val in self.fetch(v):\n",
+    "                            yield val\n",
+    "                    elif isinstance(v, list):\n",
+    "                        for l in v:\n",
+    "                            if isinstance(l, dict):\n",
+    "                                for ka,va in l.items():\n",
+    "                                    if ka == self.json_key:\n",
+    "                                        yield va\n",
+    "\n",
+    "            elif isinstance(jdata, list):\n",
+    "                for l in jdata:\n",
+    "                    if isinstance(l, dict):\n",
+    "                        for k,v in l.items():\n",
+    "                            if k == self.json_key:\n",
+    "                                yield v\n",
+    "                    elif isinstance(l, list):\n",
+    "                        for lb in l:\n",
+    "                            if isinstance(lb, dict):\n",
+    "                                for ka,va in lb.items():\n",
+    "                                    if ka == self.json_key:\n",
+    "                                        yield va\n",
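+    "\n",
+    "        # Illustrative usage sketch:\n",
+    "        #   json_url_data.json_fetcher({'a': {'x': 1}, 'b': [{'x': 2}]}, 'x').get_data()\n",
+    "        #   -> [1, 2]\n",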
+    "\n",
+    "        ##########\n",
+    "        def get_data(self, flatten=True):\n",
+    "\n",
+    "            data_extract = []\n",
+    "            flat_data = []\n",
+    "\n",
+    "            for i in self.fetch(self.json_dict):\n",
+    "                data_extract.append(i)\n",
+    "\n",
+    "            # Flatten possible nested lists\n",
+    "            # (i.e. JSON data contains multiple keys in\n",
+    "            # different nested sections)\n",
+    "            def get_data_extract(ld):\n",
+    "                for l in ld:\n",
+    "                    if isinstance(l, list):\n",
+    "                        for la in get_data_extract(l):\n",
+    "                            yield la\n",
+    "                    else:\n",
+    "                        yield l\n",
+    "\n",
+    "            if flatten == True:\n",
+    "                for u in get_data_extract(data_extract):\n",
+    "                    flat_data.append(u)\n",
+    "\n",
+    "                return flat_data\n",
+    "            else:\n",
+    "                return data_extract\n",
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    Compile URL related data.\n",
+    "    \"\"\"\n",
+    "    def get_url_data(self, url):\n",
+    "\n",
+    "        # Dict object for simple, non-nested data\n",
+    "        data_simple = {}\n",
+    "\n",
+    "        # Pre-defined dict object for specific data sets\n",
+    "        webpage_data = {}\n",
+    "\n",
+    "        startfinal_url = self.get_startfinal_urls(url)\n",
+    "        redirect_url = self.get_url_redirects(url)\n",
+    "        domain_registrar = self.get_domain_registrar(url)\n",
+    "        domaintitle_match = self.get_domain_title_match(url)\n",
+    "\n",
+    "        domain_time_relative = self.get_domain_timeinfo_relative(url)\n",
+    "        domain_time = self.get_domain_timeinfo(url)\n",
+    "\n",
+    "        html_element_iframe = self.get_tag_data(url, 'iframe')\n",
+    "        html_element_a_href = self.get_tag_data(url, 'a', link_refs['a'])\n",
+    "        html_element_img_src = self.get_tag_data(url, 'img', link_refs['img'])\n",
+    "        html_element_script_src = self.get_tag_data(url, 'script', link_refs['script'])\n",
+    "\n",
+    "        iframes_count = {\n",
+    "            'iframes_count':\n",
+    "            len(self.json_fetcher(html_element_iframe, 'iframe').get_data())\n",
+    "        }\n",
+    "\n",
+    "        multidot_urls_count = {\n",
+    "            'multidot_url_count':\n",
+    "            len(self.json_fetcher(html_element_a_href, 'multidot').get_data()) +\n",
+    "            len(self.json_fetcher(html_element_img_src, 'multidot').get_data()) +\n",
+    "            len(self.json_fetcher(html_element_script_src, 'multidot').get_data())\n",
+    "        }\n",
+    "\n",
+    "        ###################\n",
+    "        def get_total_registrars():\n",
+    "\n",
+    "            same_registrar_counts = 0\n",
+    "            other_registrar_counts = 0\n",
+    "            for k,v in link_refs.items():\n",
+    "\n",
+    "                html_element = self.get_tag_data(url, k, v)\n",
+    "\n",
+    "                same_registrar_counts += self.get_registrar_count(\n",
+    "                    domain_registrar['domain_registrar'],\n",
+    "                    html_element[k][v + '_ext']['normal']\n",
+    "                )['same_registrar_count']\n",
+    "\n",
+    "                other_registrar_counts += self.get_registrar_count(\n",
+    "                    domain_registrar['domain_registrar'],\n",
+    "                    html_element[k][v + '_ext']['normal']\n",
+    "                )['other_registrar_count']\n",
+    "\n",
+    "            registrar_counts = {\n",
+    "                'same_registrar_count': same_registrar_counts,\n",
+    "                'other_registrar_count': other_registrar_counts\n",
+    "            }\n",
+    "            return registrar_counts\n",
+    "\n",
+    "        # Avoid unnecessary nesting of the following data\n",
+    "        data_simple.update(domain_registrar)\n",
+    "        data_simple.update(domaintitle_match)\n",
+    "        data_simple.update(iframes_count)\n",
+    "        data_simple.update(multidot_urls_count)\n",
+    "        data_simple.update(get_total_registrars())\n",
+    "\n",
+    "        url_data = dict({\n",
+    "            url: [\n",
+    "                data_simple,\n",
+    "                startfinal_url,\n",
+    "                {'redirects': redirect_url},\n",
+    "\n",
+    "                domain_time_relative,\n",
+    "                domain_time,\n",
+    "\n",
+    "                {'webpage_data': [\n",
+    "                    html_element_iframe,\n",
+    "                    html_element_a_href,\n",
+    "                    html_element_img_src,\n",
+    "                    html_element_script_src\n",
+    "                ]\n",
+    "                }\n",
+    "            ]\n",
+    "        })\n",
+    "\n",
+    "        return url_data\n",
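+    "\n",
+    "# Illustrative top-level layout of get_url_data() output:\n",
+    "#   {url: [{...simple key/value data...}, {'startfinal_urls': {...}}, {'redirects': [...]},\n",
+    "#          {'domain_timestamps_relative': {...}}, {'domain_timestamps': {...}},\n",
+    "#          {'webpage_data': [...]}]}\n",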
+    "\n",
+    "\n",
+    "class write_operations(object):\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        self.filename = filename\n",
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    Set JSON file name; append a number suffix\n",
+    "    if the file exists already.\n",
+    "\n",
+    "    Returns the file name path.\n",
+    "    \"\"\"\n",
+    "    def set_filename(self):\n",
+    "\n",
+    "        c = 0\n",
+    "        while True:\n",
+    "            if os.path.exists(self.filename):\n",
+    "                if c == 0:\n",
+    "                    self.filename = self.filename + \".\" + str(c)\n",
+    "                else:\n",
+    "                    self.filename = re.sub(\"[0-9]+$\", str(c), self.filename)\n",
+    "            else:\n",
+    "                break\n",
+    "            c += 1\n",
+    "        return self.filename\n",
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    Append to a JSON file.\n",
+    "    \"\"\"\n",
+    "    def write_to_file(self, data):\n",
+    "\n",
+    "        try:\n",
+    "            with open(self.filename, \"a\") as json_file:\n",
+    "                json_file.write(data)\n",
+    "            return 0\n",
+    "        except:\n",
+    "            return 1\n",
+    "\n",
+    "######################################\n",
+    "    \"\"\"\n",
+    "    Fetch all pre-defined URLs.\n",
+    "    \"\"\"\n",
+    "    def fetch_and_store_url_data(self, urls, use_file):\n",
+    "\n",
+    "        data_parts = {}\n",
+    "        fetch_json_data = json_url_data()\n",
+    "\n",
+    "        for u in urls:\n",
+    "            print(\"Fetching URL data: %s\" % u)\n",
+    "            try:\n",
+    "                data_parts.update(fetch_json_data.get_url_data(u))\n",
+    "            except:\n",
+    "                print(\"Failed: %s\" % u)\n",
+    "\n",
+    "        json_data = json.dumps(data_parts)\n",
+    "\n",
+    "        if use_file == True:\n",
+    "            self.write_to_file(json_data)\n",
+    "\n",
+    "        return json_data\n",
+    "\n",
+    "######################################\n",
+    "\"\"\"\n",
+    "Visualize & summarize data.\n",
+    "\"\"\"\n",
+    "\n",
+    "class data_visualization(object):\n",
+    "\n",
+    "    def __init__(self, url, json_data):\n",
+    "        self.url = url\n",
+    "        self.json_data = json_data\n",
+    "\n",
+    "        self.data = json.loads(json.dumps(self.json_data)).get(self.url)\n",
+    "        self.json_url_obj = json_url_data()\n",
+    "        self.domain_registrar = self.json_url_obj.get_domain_registrar(self.url)['domain_registrar']\n",
+    "        self.webpage_data = self.json_url_obj.json_fetcher(self.data, 'webpage_data').get_data()\n",
+    "\n",
+    "    def get_urls_count_summary(self):\n",
+    "\n",
+    "        unique_refs = []\n",
+    "\n",
+    "        for k,v in link_refs.items():\n",
+    "            if v in unique_refs: continue\n",
+    "            unique_refs.append(v)\n",
+    "\n",
+    "        def link_count(refs, suffix):\n",
+    "\n",
+    "            urls_cnt = 0\n",
+    "\n",
+    "            for u in self.webpage_data:\n",
+    "                for l in refs:\n",
+    "                    urls = self.json_url_obj.json_fetcher(u, l + suffix).get_data()\n",
+    "                    for n in urls:\n",
+    "                        urls_cnt += len(n['normal'])\n",
+    "                        urls_cnt += len(n['multidot'])\n",
+    "            return urls_cnt\n",
+    "\n",
+    "        data = {\n",
+    "            'local_urls': link_count(unique_refs, '_self'),\n",
+    "            'external_urls': link_count(unique_refs, '_ext')\n",
+    "        }\n",
+    "\n",
+    "        return data\n",
+    "\n",
+    "    def get_registrars(self):\n",
+    "\n",
+    "        registrars = []\n",
+    "        #registrars.append(self.domain_registrar)\n",
+    "\n",
+    "        for w in self.webpage_data:\n",
+    "            webpage_registrars = self.json_url_obj.json_fetcher(w, 'registrar').get_data()\n",
+    "            for wa in webpage_registrars:\n",
+    "                if wa != None:\n",
+    "                    registrars.append(wa)\n",
+    "        return registrars\n",
+    "\n",
+    "    def get_registrar_count_summary(self):\n",
+    "\n",
+    "        domain_counter = dict(Counter(self.get_registrars()))\n",
+    "        data = {'fetched_domains': domain_counter, 'url_domain_registrar': self.domain_registrar }\n",
+    "        return data\n",
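+    "\n",
+    "# Illustrative usage sketch (shape of the returned dict; values vary per page):\n",
+    "#   data_visualization('https://hoxhunt.com/', json_data).get_urls_count_summary()\n",
+    "#   -> {'local_urls': 42, 'external_urls': 17}\n",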
+    "\n",
+    "######################################\n",
+    "\"\"\"\n",
+    "Execute the main program code.\n",
+    "\n",
+    "TODO: this code must figure out the correct JSON file\n",
+    "if multiple generated files are present.\n",
+    "\"\"\"\n",
+    "if __name__ == '__main__':\n",
+    "\n",
+    "    if plot_only == False:\n",
+    "        write_obj = write_operations()\n",
+    "        write_obj.set_filename()\n",
+    "        data = write_obj.fetch_and_store_url_data(urls, use_file)\n",
+    "\n",
+    "    url_str_pattern = re.compile(r\"(^[a-z]+://)?([^/]*)\")\n",
+    "\n",
+    "    if os.path.exists(filename):\n",
+    "        with open(filename, \"r\") as json_file:\n",
+    "            json_data = json.load(json_file)\n",
+    "    else:\n",
+    "        json_data = data\n",
+    "\n",
+    "    # Get URLs from the available JSON data\n",
+    "    for key_url in json_data.keys():\n",
+    "\n",
+    "        print(\"Generating statistics: %s\" % key_url)\n",
+    "\n",
+    "        fig = plt.figure()\n",
+    "        fig_params = {\n",
+    "            'xtick.labelsize': 8,\n",
+    "            'figure.figsize': [9,8]\n",
+    "            # 'figure.constrained_layout.use': True\n",
+    "        }\n",
+    "        plt.rcParams.update(fig_params)\n",
+    "\n",
+    "        domain_string = url_str_pattern.split(key_url)[2].replace('.','')\n",
+    "        summary = data_visualization(key_url, json_data)\n",
+    "\n",
+    "        summary_registrars = summary.get_registrar_count_summary()['fetched_domains']\n",
+    "\n",
+    "        x_r = list(summary_registrars.keys())\n",
+    "        y_r = list(summary_registrars.values())\n",
+    "\n",
+    "        # Show bar values\n",
+    "        for index, value in enumerate(y_r):\n",
+    "            plt.text(x=index, y=value+0.5, s=value, fontdict=dict(fontsize=8))\n",
+    "\n",
+    "        title_r = \"Domains associated with HTML URL data (\" + key_url + \")\"\n",
+    "        xlabel_r = \"Fetched domains\"\n",
+    "        ylabel_r = \"Domain count\"\n",
+    "\n",
+    "        plt.bar(x_r, y_r, color=\"green\", edgecolor=\"black\")\n",
+    "        plt.title(title_r)\n",
+    "        plt.xlabel(xlabel_r)\n",
+    "        plt.ylabel(ylabel_r)\n",
+    "        plt.xticks(rotation=45, horizontalalignment=\"right\")\n",
+    "\n",
+    "        if save_plot_images == True:\n",
+    "            plt.savefig(os.getcwd() + \"/\" + \"domain_figure_\" + domain_string + \".png\", dpi=plot_images_dpi)\n",
+    "        plt.show()\n",
+    "\n",
+    "        #fig_u = plt.figure()\n",
+    "\n",
+    "        #summary_urls = summary.get_urls_count_summary()\n",
+    "\n",
+    "        #x_u = list(summary_urls.keys())\n",
+    "        #y_u = list(summary_urls.values())\n",
+    "        #title_u = \"Local and external URL references (\" + key_url + \")\"\n",
+    "        #xlabel_u = \"Fetched URLs\"\n",
+    "        #ylabel_u = \"URL count\"\n",
+    "\n",
+    "        #plt.bar(x_u, y_u, color=\"blue\", edgecolor='black')\n",
+    "        #plt.title(title_u)\n",
+    "        #plt.xlabel(xlabel_u)\n",
+    "        #plt.ylabel(ylabel_u)\n",
+    "        #plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/code/url-analyzer.py b/code/url-analyzer.py
new file mode 100644
index 0000000..7201910
--- /dev/null
+++ b/code/url-analyzer.py
@@ -0,0 +1,862 @@
+#!/bin/env python
+
+"""
+URL data extractor
+
+Pekka Helenius
+
+Requirements:
+
+Python 3
+Python 3 BeautifulSoup4 (python-beautifulsoup4)
+Python 3 whois (python-whois; PyPI)
+Python 3 JSON Schema (python-jsonschema)
+Python 3 Numpy (python-numpy)
+Python 3 matplotlib (python-matplotlib)
+
+TODO: URL domain part length comparison analysis
+TODO: URL non-TLD part length comparison analysis
+  - in phishing webpages, URLs tend to be much longer than in legitimate webpages;
+    the domains themselves (without the TLD), however, tend to be much shorter
+  - phishing URLs often contain more dots and subdomains than legitimate URLs
+  - legitimate: robots.txt redirects bots to a legitimate domain rather than to the original phishing domain
+
+TODO: Website visual similarity analysis
+TODO: consistency of RDN usage in HTML data
+"""
+
+######################################
+
+#%matplotlib notebook
+import matplotlib.pyplot as plt
+
+from bs4 import BeautifulSoup as bs
+from collections import Counter
+from datetime import date, datetime
+import json
+import os
+import re
+import requests
+from time import sleep
+import urllib
+from whois import whois
+
+# Target URLs
+urls = [
+    "https://hoxhunt.com/",
+    "https://hs.fi",
+    "https://ts.fi",
+    "https://facebook.com"
+]
+
+# Some web servers may block our request unless we set a widely used, well-known user agent string
+request_headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
+}
+
+# Date format for domain timestamps
+dateformat = "%Y/%m/%d"
+
+# Some web servers may not like data being fetched too fast
+# Sleep time in seconds
+sleep_interval_between_requests = 0.5
+
+# Write JSON results to a file?
+use_file = True
+# Full file path + name
+filename = os.getcwd() + "/" + "url_info.json"
+
+# Generate plot from existing JSON data?
+plot_only = False
+
+# Save generated plot images?
+save_plot_images = True
+
+# DPI of plot images
+plot_images_dpi = 150
+
+# Common link attribute references in various HTML elements
+link_refs = {
+    'a': 'href',
+    'img': 'src',
+    'script': 'src'
+}
+
+############################################################################
+############################################################################
+
+class json_url_data(object):
+
+# def __init__(self):
+
+######################################
+    """
+    Set a new HTTP session and get response.
+
+    Returns a requests.models.Response object.
+    """
+    def set_session(self, url, method='get', redirects=True):
+
+        # HTTP response status codes 1XX, 2XX and 3XX are OK
+        # Treat other codes as errors
+        sc = re.compile(r"^[123][0-9]{2}")
+
+        sleep(sleep_interval_between_requests)
+
+        try:
+            session = requests.Session()
+            response = session.request(method, url, headers=request_headers, allow_redirects=redirects)
+
+            if not sc.match(str(response.status_code)):
+                raise Exception("Error: got invalid response status from the web server")
+            return response
+
+        except:
+            raise Exception("Error: HTTP session could not be established. URL: '" + url + "' (method: " + method + ")") from None
+
+######################################
+    """
+    Fetch HTML data.
+
+    Returns a bs4.BeautifulSoup object.
+    """
+    def get_html_data(self, url):
+
+        try:
+            data = bs(self.set_session(url).content, 'html.parser')
+            return data
+        except:
+            raise Exception("Error: HTML data could not be retrieved")
+
+######################################
+    """
+    Get URL redirects and related HTTP status codes.
+
+    Returns a list object.
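+
+    Example of the returned structure (illustrative values):
+    [{'redirect_url': 'http://ts.fi/', 'status': 301}]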
+ """ + def get_url_redirects(self, url): + + response = self.set_session(url) + list_data = [] + + if response.history: + + for r in response.history: + list_data.append({'redirect_url': r.url, 'status': r.status_code}) + + return list_data + +###################################### + """ + Extract title HTML element contents from given HTML data. + + Returns a string object. + """ + def get_webpage_title(self, url): + + html_data = self.get_html_data(url) + + title = html_data.title.string + return title + +###################################### + """ + Get WHOIS domain data. + + Returns a dict object. + """ + def get_whois_data(self, url): + dict_data = whois(url) + return dict_data + +###################################### + """ + Get domain name based on WHOIS domain data. + """ + def get_domain_name(self, url): + domain_name = self.get_whois_data(url).domain_name + + if type(domain_name) is list: + return domain_name[0].lower() + else: + return domain_name.lower() + +###################################### + """ + Get initial and final URLs + + Compare whether the final (destination) URL + matches with the initial URL in a request. + + Returns a dict object. + """ + def get_startfinal_urls(self, url): + + response = self.set_session(url) + end_url = response.url + + start_match = False + final_match = False + + # dr = re.compile(r"^([a-z]+://)?([^/]+)") + # dr_group_lastindex = dr.match(url).lastindex + # domain_name = dr.match(url).group(dr_group_lastindex) + + domain_name = self.get_domain_name(url) + + if re.search(domain_name, end_url): + final_match = True + + dict_data = { + 'startfinal_urls': { + 'start_url': { + 'url': url + }, + 'final_url': { + 'url': end_url, 'domain_match': final_match + } + } + } + + return dict_data + +###################################### + """ + Get domain registrar + + Returns a dict object. + """ + def get_domain_registrar(self, url): + dict_data = {'domain_registrar': self.get_whois_data(url).registrar } + return dict_data + +###################################### + """ + Do comparison between the domain name, extracted + from WHOIS domain data and contents of a title HTML + element, extracted from HTML data based on a given URL. + + Returns a dict object. + """ + def get_domain_title_match(self, url): + + domain_name = self.get_domain_name(url) + title = self.get_webpage_title(url) + + # If is string: + if type(domain_name) is str: + if re.search(domain_name, title, re.IGNORECASE): + match = True + else: + match = False + + # If is list: + elif type(domain_name) is list: + for d in domain_name: + if re.search(d, title, re.IGNORECASE): + match = True + break + else: + match = False + else: + match = False + + dict_data = { + 'webpage_title': title, + 'domain_in_webpage_title': match + } + + return dict_data + +###################################### + """ + Get a single timestamp from given data + + Two scenarios are considered: dates argument is either + a list or a string. If it is a list, then we need + to decide which date value to extract. + + Returns a date object. + """ + def get_single_date(self, dates, newest=False): + + dates_epoch = [] + + if type(dates) is list: + for d in dates: + dates_epoch.append(d.timestamp()) + else: + dates_epoch.append(dates.timestamp()) + + return datetime.fromtimestamp(sorted(dates_epoch, reverse=newest)[0]) + +###################################### + """ + Get domain time information based on WHOIS domain data. + + Returns a dict object. 
######################################
    """
    Get domain time information based on WHOIS domain data.

    Returns a dict object.
    """
    def get_domain_timeinfo(self, url):

        whois_data = self.get_whois_data(url)
        domain_creation_date = self.get_single_date(whois_data.creation_date, newest=False)
        domain_updated_date = self.get_single_date(whois_data.updated_date, newest=False)
        domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest=False)

        dict_data = {
            'domain_timestamps':
            {
                'created': domain_creation_date.strftime(dateformat),
                'updated': domain_updated_date.strftime(dateformat),
                'expires': domain_expiration_date.strftime(dateformat)
            }
        }

        return dict_data

######################################
    """
    Get domain time information based on WHOIS domain data,
    relative to the current date (UTC time).

    Returns a dict object.
    """
    def get_domain_timeinfo_relative(self, url):

        date_now = datetime.utcnow()

        whois_data = self.get_whois_data(url)
        domain_creation_date = self.get_single_date(whois_data.creation_date, newest=False)
        domain_updated_date = self.get_single_date(whois_data.updated_date, newest=False)
        domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest=False)

        dict_data = {
            'domain_timestamps_relative':
            {
                'current_date': (date_now.strftime(dateformat)),
                'created_days_ago': (date_now - domain_creation_date).days,
                'updated_days_ago': (date_now - domain_updated_date).days,
                'expires_days_left': (domain_expiration_date - date_now).days
            }
        }

        return dict_data

######################################
    """
    Determine whether the URL matches syntaxes such as

    '../foo/bar/'
    '/foo/../../bar/'
    'https://foo.bar/foo/../'

    etc.

    Returns a boolean object.
    """
    def is_multidot_url(self, url):

        multidot = re.compile(r".*[.]{2}/.*")

        return bool(multidot.match(url))
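######################################
    """
    Illustrative sketch only (NOT part of the original analyzer): newly
    registered domains are a common phishing signal, so a simple check can
    be built on top of get_domain_timeinfo_relative(). The method name and
    the 90-day threshold are assumptions.
    """
    def is_recently_registered(self, url, max_age_days=90):

        relative = self.get_domain_timeinfo_relative(url)
        age_days = relative['domain_timestamps_relative']['created_days_ago']
        return age_days <= max_age_days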
######################################
    """
    Get HTML element data from HTML data contents.

    Two fetching methods are supported:
    - A) use only the HTML element/tag name and extract the raw contents
      of these tags
    - B) use both the HTML element/tag name and a more fine-grained inner
      attribute name to determine which HTML elements are extracted

    Special case - URL link references:
    - attributes 'href' and 'src' are considered link referrals and
      are handled in a special way
    - A) link referrals pointing to the site's own domain are placed in
      the 'self_refs' list (patterns: '#', '/' and '../')
    - B) link referrals to external domains are placed in the 'ext_refs'
      list (patterns such as 'https://foo.bar.dot/fancysite' etc.)

    - Both A) and B) link categories have 'normal' and 'multidot' subcategories
      - normal links do not contain the pattern '../'
      - multidot links contain the pattern '../'

    Returns a dict object.
    """
    def get_tag_data(self, url, tag, attribute=None):

        html_data = self.get_html_data(url)
        domain_name = self.get_domain_name(url)
        data = []

        if attribute is not None:

            for d in html_data.find_all(tag):

                # Ignore the HTML tag if it does not contain our attribute
                if d.get(attribute) is not None:
                    data.append(d.get(attribute))

            if attribute == 'href' or attribute == 'src':

                self_refs = { 'normal': [], 'multidot': []}
                ext_refs = { 'normal': [], 'multidot': []}

                # Syntax: '#', '/', '../'
                rs = re.compile(r"^[/#]|^[.]{2}/.*")

                # Syntax: 'scheme:path/'
                rd = re.compile(r"^[a-z]+:[a-z]+/")

                # Syntax examples:
                # 'http://foo.bar/', 'https://foo.bar/', 'foo.bar/', 'https://virus.foo.bar/'
                # The domain name is escaped so that its dots are matched literally
                rl = re.compile(r"^([a-z]+://)?([^/]*" + re.escape(domain_name) + "/)")

                for s in data:

                    # Ignore mailto links
                    if re.match("^mailto:", s): continue

                    if rs.match(s) or rl.match(s) or rd.match(s):
                        if self.is_multidot_url(s):
                            self_refs['multidot'].append(s)
                        else:
                            self_refs['normal'].append(s)
                    else:
                        if self.is_multidot_url(s):
                            try:
                                ext_refs['multidot'].append({'url': s, 'registrar': self.get_whois_data(s).registrar })
                            except Exception:
                                # Fallback if the WHOIS query fails; keep the
                                # URL in the same multidot subcategory
                                ext_refs['multidot'].append({'url': s, 'registrar': None })
                        else:
                            try:
                                ext_refs['normal'].append({'url': s, 'registrar': self.get_whois_data(s).registrar })
                            except Exception:
                                ext_refs['normal'].append({'url': s, 'registrar': None })

                dict_data = {
                    tag: {
                        attribute + '_ext': ext_refs,
                        attribute + '_self': self_refs
                    }
                }

            else:
                dict_data = {
                    tag: {
                        attribute: data
                    }
                }

        else:
            for d in html_data.find_all(tag):
                data.append(d.prettify())

            dict_data = {
                tag: data
            }

        return dict_data

######################################
    """
    How many external URL links have the same registrar as
    the webpage itself?
    """
    def get_registrar_count(self, registrar, urls):

        same = 0

        for u in urls:
            for k,v in u.items():
                if k == 'registrar' and v == registrar:
                    same += 1

        other = len(urls) - same

        dict_data = {
            'same_registrar_count': same,
            'other_registrar_count': other
        }

        return dict_data

######################################
    """
    Get values existing in a dict object,
    based on a known key string.

    Returns a list object.

    TODO: Major re-work for the fetch function

    TODO: Support for more sophisticated JSON key string filtering
    (possibility to use multiple keys for filtering)
    """
    class json_fetcher(object):

        def __init__(self, dict_data, json_key):
            # JSON round-trip makes an independent, JSON-safe copy of the data
            self.json_dict = json.loads(json.dumps(dict_data))
            self.json_key = json_key

        ##########
        # Ref: https://www.codespeedy.com/how-to-loop-through-json-with-subkeys-in-python/
        def fetch(self, jdata):

            if isinstance(jdata, dict):

                for k,v in jdata.items():
                    if k == self.json_key:
                        yield v
                    elif isinstance(v, dict):
                        for val in self.fetch(v):
                            yield val
                    elif isinstance(v, list):
                        for l in v:
                            if isinstance(l, dict):
                                for ka,va in l.items():
                                    if ka == self.json_key:
                                        yield va

            elif isinstance(jdata, list):
                for l in jdata:
                    if isinstance(l, dict):
                        for k,v in l.items():
                            if k == self.json_key:
                                yield v
                    elif isinstance(l, list):
                        for lb in l:
                            if isinstance(lb, dict):
                                for ka,va in lb.items():
                                    if ka == self.json_key:
                                        yield va

        ##########
        def get_data(self, flatten=True):

            data_extract = []
            flat_data = []

            for i in self.fetch(self.json_dict):
                data_extract.append(i)

            # Flatten possible nested lists
            # (i.e. the JSON data contains the key in
            # several nested sections)
            def get_data_extract(ld):
                for l in ld:
                    if isinstance(l, list):
                        for la in get_data_extract(l):
                            yield la
                    else:
                        yield l

            if flatten:
                for u in get_data_extract(data_extract):
                    flat_data.append(u)

                return flat_data
            else:
                return data_extract
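######################################
    """
    Hypothetical usage sketch (not in the original code): json_fetcher
    walks nested dicts/lists and yields every value stored under a given
    key, so for example

        sample = {'a': {'registrar': 'X'}, 'b': [{'registrar': 'Y'}]}
        json_url_data.json_fetcher(sample, 'registrar').get_data()
        # -> ['X', 'Y']
    """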
######################################
    """
    Compile URL related data.
    """
    def get_url_data(self, url):

        # Dict object for simple, non-nested data
        data_simple = {}

        startfinal_url = self.get_startfinal_urls(url)
        redirect_url = self.get_url_redirects(url)
        domain_registrar = self.get_domain_registrar(url)
        domaintitle_match = self.get_domain_title_match(url)

        domain_time_relative = self.get_domain_timeinfo_relative(url)
        domain_time = self.get_domain_timeinfo(url)

        html_element_iframe = self.get_tag_data(url, 'iframe')
        html_element_a_href = self.get_tag_data(url, 'a', link_refs['a'])
        html_element_img_src = self.get_tag_data(url, 'img', link_refs['img'])
        html_element_script_src = self.get_tag_data(url, 'script', link_refs['script'])

        iframes_count = {
            'iframes_count':
            len(self.json_fetcher(html_element_iframe, 'iframe').get_data())
        }

        multidot_urls_count = {
            'multidot_url_count':
            len(self.json_fetcher(html_element_a_href, 'multidot').get_data()) +
            len(self.json_fetcher(html_element_img_src, 'multidot').get_data()) +
            len(self.json_fetcher(html_element_script_src, 'multidot').get_data())
        }

        ###################
        # NOTE: this re-fetches each URL and its WHOIS data once per link
        # reference type, which is slow but mirrors the original flow
        def get_total_registrars():

            same_registrar_counts = 0
            other_registrar_counts = 0

            for k,v in link_refs.items():

                html_element = self.get_tag_data(url, k, v)

                registrar_count = self.get_registrar_count(
                    domain_registrar['domain_registrar'],
                    html_element[k][v + '_ext']['normal']
                )

                same_registrar_counts += registrar_count['same_registrar_count']
                other_registrar_counts += registrar_count['other_registrar_count']

            registrar_counts = {
                'same_registrar_count': same_registrar_counts,
                'other_registrar_count': other_registrar_counts
            }
            return registrar_counts

        # Avoid unnecessary nesting of the following data
        data_simple.update(domain_registrar)
        data_simple.update(domaintitle_match)
        data_simple.update(iframes_count)
        data_simple.update(multidot_urls_count)
        data_simple.update(get_total_registrars())

        url_data = {
            url: [
                data_simple,
                startfinal_url,
                {'redirects': redirect_url},

                domain_time_relative,
                domain_time,

                {'webpage_data': [
                    html_element_iframe,
                    html_element_a_href,
                    html_element_img_src,
                    html_element_script_src
                    ]
                }
            ]
        }

        return url_data
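############################################################################
# Illustrative output shape of json_url_data.get_url_data() for a single
# URL (trimmed; keys taken from the code above, values hypothetical):
#
# {
#   "https://example.com/": [
#     {"domain_registrar": "Example Registrar Inc.",
#      "webpage_title": "Example", "domain_in_webpage_title": true,
#      "iframes_count": 0, "multidot_url_count": 0,
#      "same_registrar_count": 2, "other_registrar_count": 5},
#     {"startfinal_urls": {...}},
#     {"redirects": [...]},
#     {"domain_timestamps_relative": {...}},
#     {"domain_timestamps": {...}},
#     {"webpage_data": [...]}
#   ]
# }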
############################################################################

class write_operations(object):

    def __init__(self):
        self.filename = filename

######################################
    """
    Set the JSON file name; append a number suffix
    if the file exists already.

    Returns the file name path.
    """
    def set_filename(self):

        c = 0
        while True:
            if os.path.exists(self.filename):
                if c == 0:
                    self.filename = self.filename + "." + str(c)
                else:
                    self.filename = re.sub("[0-9]+$", str(c), self.filename)
            else:
                break
            c += 1
        return self.filename

######################################
    """
    Append data to a JSON file.
    """
    def write_to_file(self, data):

        try:
            with open(self.filename, "a") as json_file:
                json_file.write(data)
            return 0
        except OSError:
            return 1

######################################
    """
    Fetch all pre-defined URLs.
    """
    def fetch_and_store_url_data(self, urls, use_file):

        data_parts = {}
        fetch_json_data = json_url_data()

        for u in urls:
            print("Fetching URL data: %s" % u)
            try:
                data_parts.update(fetch_json_data.get_url_data(u))
            except Exception as e:
                print("Failed: %s (%s)" % (u, e))

        json_data = json.dumps(data_parts)

        if use_file:
            self.write_to_file(json_data)

        return json_data

######################################
"""
Visualize & summarize data.
"""
class data_visualization(object):

    def __init__(self, url, json_data):
        self.url = url
        self.json_data = json_data

        # JSON round-trip makes an independent, JSON-safe copy of the data
        self.data = json.loads(json.dumps(self.json_data)).get(self.url)
        self.json_url_obj = json_url_data()
        # NOTE: this performs a live WHOIS query even when plotting from a file
        self.domain_registrar = self.json_url_obj.get_domain_registrar(self.url)['domain_registrar']
        self.webpage_data = self.json_url_obj.json_fetcher(self.data, 'webpage_data').get_data()

    def get_urls_count_summary(self):

        unique_refs = []

        for k,v in link_refs.items():
            if v in unique_refs: continue
            unique_refs.append(v)

        def link_count(refs, suffix):

            urls_cnt = 0

            for u in self.webpage_data:
                for l in refs:
                    urls = self.json_url_obj.json_fetcher(u, l + suffix).get_data()
                    for n in urls:
                        urls_cnt += len(n['normal'])
                        urls_cnt += len(n['multidot'])
            return urls_cnt

        data = {
            'local_urls': link_count(unique_refs, '_self'),
            'external_urls': link_count(unique_refs, '_ext')
        }

        return data

    def get_registrars(self):

        registrars = []
        #registrars.append(self.domain_registrar)

        for w in self.webpage_data:
            webpage_registrars = self.json_url_obj.json_fetcher(w, 'registrar').get_data()
            for wa in webpage_registrars:
                if wa is not None:
                    registrars.append(wa)
        return registrars

    def get_registrar_count_summary(self):

        domain_counter = dict(Counter(self.get_registrars()))
        data = {'fetched_domains': domain_counter, 'url_domain_registrar': self.domain_registrar }
        return data
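######################################
"""
Hypothetical usage sketch (not in the original notebook): summarize one
URL from an existing JSON results file without plotting anything.

    with open(filename, "r") as f:
        json_data = json.load(f)

    summary = data_visualization("https://hoxhunt.com/", json_data)
    print(summary.get_registrar_count_summary())
    print(summary.get_urls_count_summary())
"""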
######################################
"""
Execute the main program code.

TODO: this code must figure out the correct JSON file
if multiple generated files are present.
"""
if __name__ == '__main__':

    if not plot_only:
        write_obj = write_operations()
        write_obj.set_filename()
        data = write_obj.fetch_and_store_url_data(urls, use_file)

    url_str_pattern = re.compile(r"(^[a-z]+://)?([^/]*)")

    if os.path.exists(filename):
        with open(filename, "r") as json_file:
            json_data = json.load(json_file)
    else:
        # fetch_and_store_url_data returns a JSON string, so parse it back
        json_data = json.loads(data)

    # Get URLs from the available JSON data
    for key_url in json_data.keys():

        print("Generating statistics: %s" % key_url)

        fig = plt.figure()
        fig_params = {
            'xtick.labelsize': 8,
            'figure.figsize': [9,8]
            # 'figure.constrained_layout.use': True
        }
        plt.rcParams.update(fig_params)

        domain_string = url_str_pattern.split(key_url)[2].replace('.','')
        summary = data_visualization(key_url, json_data)

        summary_registrars = summary.get_registrar_count_summary()['fetched_domains']

        x_r = list(summary_registrars.keys())
        y_r = list(summary_registrars.values())

        # Show bar values; do not name the loop variable 'data',
        # it would shadow the JSON string defined above
        for index, value in enumerate(y_r):
            plt.text(x=index, y=value+0.5, s=str(value), fontdict=dict(fontsize=8))

        title_r = "Domains associated with HTML URL data (" + key_url + ")"
        xlabel_r = "Fetched domains"
        ylabel_r = "Domain count"

        plt.bar(x_r, y_r, color="green", edgecolor="black")
        plt.title(title_r)
        plt.xlabel(xlabel_r)
        plt.ylabel(ylabel_r)
        plt.xticks(rotation=45, horizontalalignment="right")

        if save_plot_images:
            plt.savefig(os.path.join(os.getcwd(), "domain_figure_" + domain_string + ".png"), dpi=plot_images_dpi)
        plt.show()

        #fig_u = plt.figure()

        #summary_urls = summary.get_urls_count_summary()

        #x_u = list(summary_urls.keys())
        #y_u = list(summary_urls.values())
        #title_u = "Local and external URL references (" + key_url + ")"
        #xlabel_u = "Fetched URLs"
        #ylabel_u = "URL count"

        #plt.bar(x_u, y_u, color="blue", edgecolor='black')
        #plt.title(title_u)
        #plt.xlabel(xlabel_u)
        #plt.ylabel(ylabel_u)
        #plt.show()
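######################################
# Sketch for the TODO above (an assumption, not wired into the main code):
# when several suffixed result files exist (url_info.json, url_info.json.0,
# url_info.json.1, ...), pick the most recently modified one. The function
# name is hypothetical and the function is not called anywhere.
def find_latest_json(base=filename):
    import glob
    candidates = glob.glob(base) + glob.glob(base + ".*")
    return max(candidates, key=os.path.getmtime) if candidates else None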