diff --git a/README.md b/README.md index 488aef5..535eec1 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,41 @@ Simple JPEG2000 GML data parser / extractor, _without additional tools such as G ## Usage +File: [gmlparser.py](data/gmlparser.py) + +``` +usage: gmlparser.py [-h] [-i [INPUTFILE]] [-f [OUTPUTFORMAT]] + [-o [OUTPUTFILE]] [-l [FORMATTING]] + +optional arguments: + -h, --help show this help message and exit + -i [INPUTFILE], --input [INPUTFILE] + Input JPEG2000 image file + -f [OUTPUTFORMAT], --dataformat [OUTPUTFORMAT] + Output format (Default: xml; Available: xml | json | + [tfw|worldfile] | info) + -o [OUTPUTFILE], --output [OUTPUTFILE] + Output file name + -l [FORMATTING], --formatting [FORMATTING] + Data formatting (Default: pretty; Available: raw | + pretty) + +``` + +### Examples (commands + output): + +**JSON:** + +![](images/sample_json.png) + +**XML:** + +![](images/sample_xml.png) + +**TFW + info:** + +![](images/sample_tfw_info.png) + ## License This repository uses GPLv3 license. See [LICENSE](LICENSE) for details. diff --git a/data/gmlparser.py b/data/gmlparser.py new file mode 100644 index 0000000..264c447 --- /dev/null +++ b/data/gmlparser.py @@ -0,0 +1,640 @@ +#!/usr/bin/env python3 + +# Simple JPEG2000 GML data parser +# Copyright (C) 2019 Pekka Helenius +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
################################################################

import sys
from warnings import warn as Warn
import os.path
import argparse
import re
import xmltodict
import json
import urllib.request as URL
import math
# TODO import csv

# TODO retrieve place name by using metadata coordinates & ref.system info
#      requires internet connection and connection to a valid server
#
# TODO rename JPEG2000 file based on metadata entries, syntax given by user
#
# TODO fix tfw export for JPEG2000 files

################################################################
#
# INPUT ARGUMENTS

argparser = argparse.ArgumentParser()

argparser.add_argument('-i', '--input',      help = 'Input JPEG2000 image file', nargs = '?', dest = 'inputfile')
argparser.add_argument('-f', '--dataformat', help = 'Output format (Default: xml; Available: xml | json | [tfw|worldfile] | info)', nargs = '?', dest = 'outputformat')
argparser.add_argument('-o', '--output',     help = 'Output file name', nargs = '?', dest = 'outputfile')
# The help text used to advertise 'raw' as the default although the code
# below has always fallen back to 'pretty'.
argparser.add_argument('-l', '--formatting', help = 'Data formatting (Default: pretty; Available: raw, pretty)', nargs = '?', dest = 'formatting')

args = argparser.parse_args()

# Formatting defaults to pretty format
#
if args.formatting is None:
    args.formatting = 'pretty'

# Output format defaults to xml, as promised by the help text
# (previously a missing -f aborted the program instead)
#
if args.outputformat is None:
    args.outputformat = 'xml'

################################################################

# Without any arguments there is nothing to do: show usage and leave
if len(sys.argv) <= 1:
    argparser.print_help()
    sys.exit(0)

# Options were given, but not the input file. Fail with a clear message
# instead of an AttributeError from the suffix check below.
if args.inputfile is None:
    raise ValueError("Error: No input file specified")

# A wrong suffix is only a warning; the header check below decides.
# The comparison is case-insensitive so that '.JP2' is accepted, too.
if not args.inputfile.lower().endswith('.jp2'):
    Warn("Warning: Not a valid JPEG2000 file suffix")

if args.outputformat not in ('json', 'xml', 'tfw', 'worldfile', 'info'):
    raise ValueError("Error: Not a valid output format")

################################################################
#
# JPEG2000 CHECK STRINGS

# Look for these jp2 strings in file header
#
jp2_header_str  = ['ftypjp2', 'jp2 jpx', 'jp2', 'jp2h', 'jp2 J2P1']

# Byte strings marking the line where the GML metadata starts, and the
# byte string marking the line where it ends. The file is read in binary
# mode, so the markers must be bytes as well.
#
mdata_start_str = [ str.encode('gml.data'), str.encode('gxml'), str.encode('fxml') ]
mdata_end_str   = str.encode('uuid')

################################################################
#
# METADATA EXTRACTION HELPERS

def _find_line(lines, markers, start=0):
    """Return the index of the first line at or after *start* that contains
    any of the byte strings in *markers*, or None if no such line exists."""
    for index, line in enumerate(lines):
        if index >= start and any(marker in line for marker in markers):
            return index
    return None

def _decode_utf8_chunks(chunks):
    """Decode the byte chunks that are valid UTF-8 and join them with spaces.

    The chunks come from a binary file, so many of them are not text at all;
    those are skipped on purpose.
    """
    decoded = []
    for chunk in chunks:
        try:
            decoded.append(chunk.decode('utf-8', errors='strict'))
        except UnicodeDecodeError:
            continue
    return ' '.join(decoded)

def _extract_gml_xml(lines):
    """Return the GML document embedded in *lines* as one whitespace-normalized string.

    lines -- all lines of the JPEG2000 file as bytes

    Raises ValueError if the metadata start marker or a usable XML element
    cannot be found.
    """
    mstart_num = _find_line(lines, mdata_start_str)
    if mstart_num is None:
        raise ValueError("Error: Could not find GML metadata in the file")

    # The end marker is only meaningful after the start line. If it is
    # missing, use everything up to the end of the file.
    mend_num = _find_line(lines, [mdata_end_str], mstart_num + 1)
    if mend_num is None:
        mend_num = len(lines)

    # The start line mixes binary box headers with the beginning of the XML,
    # so it is split into whitespace separated tokens which are decoded one
    # by one. The rest of the metadata lines follow it (the start line itself
    # must not be added a second time).
    chunks = lines[mstart_num].split() + lines[mstart_num + 1:mend_num]
    metadata_str = _decode_utf8_chunks(chunks)

    # Drop everything before the first '<' and a single trailing non-'>' character
    metadata_xml_all = re.sub(r'(^[^<]*)|([^>]$)', '', metadata_str)

    # Find the last closing tag and take the name between '</' and '>'.
    # NOTE(review): the two patterns of the original code arrived here as
    # empty strings; they are reconstructed from the accompanying comments.
    closing_tags = list(re.finditer(r'</([^<>\s]+)\s*>', metadata_xml_all))
    if not closing_tags:
        raise ValueError("Error: Could not find GML metadata in the file")
    last_close = closing_tags[-1]
    last_tag = last_close.group(1)

    # Find the first opening occurrence of that same tag. The name is escaped
    # because tag names contain regex metacharacters such as '.'.
    first_open = re.search('<' + re.escape(last_tag) + r'(?=[\s>/])', metadata_xml_all)
    if first_open is None:
        raise ValueError("Error: Could not find GML metadata in the file")

    # Keep only the element itself and normalize all whitespace to single spaces
    element = metadata_xml_all[first_open.start():last_close.end()]
    return ' '.join(element.split())

################################################################
#
# OPEN JPEG2000 file

# Open the image file in read-only binary mode and read it once;
# both the header check and the metadata extraction work on these lines
with open(args.inputfile, 'rb') as f:
    jp2_lines = f.readlines()

################################################################
#
# JPEG2000 header check

# Decode the first 4 file lines (bytes -> string conversion), ignoring
# anything that is not valid UTF-8
header_str = ''.join(line.decode('utf-8', errors='ignore') for line in jp2_lines[:4])

# At least one of the jp2 specific strings must be present in the header
if not any(jp2_str in header_str for jp2_str in jp2_header_str):
    raise ValueError("Error: Not a valid JPEG2000 file")

################################################################
#
# PARSE METADATA LINES

# Despite its name this is a string: the extracted XML element
metadata_joined_list = _extract_gml_xml(jp2_lines)

################################################################
#

class GMLDataParser(object):
    """Convert an XML document string into XML/JSON renderings and search it."""

    def __init__(self, datalist):
        # datalist -- the XML document as a string
        self.datalist = datalist

    def xmlraw(self):
        """Return the document as parsed by xmltodict (a dict-like tree, not a string)."""
        return xmltodict.parse(self.datalist)

    def xmlpretty(self):
        """Return the document as an indented XML string."""
        return xmltodict.unparse(xmltodict.parse(self.datalist),
                                 pretty=True,indent="  ",newl="\n")

    def jsonraw(self):
        """Return the document as compact JSON (no whitespace between tokens)."""
        return json.dumps(xmltodict.parse(self.datalist),
                          separators=(',', ':'))

    def jsonpretty(self):
        """Return the document as indented JSON with sorted keys."""
        return json.dumps(xmltodict.parse(self.datalist),
                          indent=2, sort_keys=True)

    # Convert GML metadata to JSON tree object
    def jsontree(self):
        """Return the document as plain dicts/lists, loaded back from jsonpretty()."""
        return json.loads(self.jsonpretty())

    # Function to get nested key values from JSON data
    # by arainchi
    # https://stackoverflow.com/a/19871956
    def findkey(self, tree, keyvalue):
        """Yield every value stored under key *keyvalue* anywhere in *tree*.

        Lists and dicts are searched recursively; any other type yields nothing.
        """
        if isinstance(tree, list):
            for i in tree:
                yield from self.findkey(i, keyvalue)
        elif isinstance(tree, dict):
            if keyvalue in tree:
                yield tree[keyvalue]
            for j in tree.values():
                yield from self.findkey(j, keyvalue)

gmlparser = GMLDataParser(metadata_joined_list)
gml_json  = gmlparser.jsontree()

def findgmlkey(data, gmlkey, num):
    """Return the *num*-th value found for *gmlkey* in *data*.

    Returns the string 'Unknown' when there is no such value.
    """
    try:
        return list(gmlparser.findkey(data, gmlkey))[num]
    except (IndexError, TypeError):
        # In a case we can't parse GML data for this element, return string 'Unknown'
        return "Unknown"

################################################################
#
# Extract relevant values for TFW file/Worldfile

# Signed decimal number, optionally with an exponent
_NUMBER_RE = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')

def _numeric_fields(value):
    """Return the numbers contained in *value* as a list of strings.

    A dict is expected to carry its text under '#text' (xmltodict layout for
    elements with attributes). Anything without numbers gives an empty list.
    """
    if isinstance(value, dict):
        value = value.get('#text')
    if isinstance(value, str):
        return _NUMBER_RE.findall(value)
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    return []

def _offset_vector_texts(tree):
    """Return the texts of the first two gml:offsetVector elements of *tree*.

    Falls back to the first two '#text' values found anywhere in the tree
    when fewer than two offset vectors can be read directly.
    """
    texts = []
    for found in gmlparser.findkey(tree, 'gml:offsetVector'):
        for entry in (found if isinstance(found, list) else [found]):
            if isinstance(entry, dict):
                entry = entry.get('#text')
            if isinstance(entry, str):
                texts.append(entry)
    if len(texts) >= 2:
        return texts[0], texts[1]
    return findgmlkey(tree, '#text', 0), findgmlkey(tree, '#text', 1)

class GML_Pos_offsetVectors():
    """World file values read from the GML metadata.

    All values are computed once, when the class is defined, and are
    available as class attributes:

    g       -- list of four values (see the world file definition below);
               filled with 'Unknown' when the offset vectors are missing
    gml_pos -- list of at least two values, the origin coordinates;
               ['Unknown', 'Unknown'] when the origin is missing

    Raises ValueError if offset vectors exist but do not hold 4 or 6 numbers.
    """

    # Sample metadata structure of JPEG2000 files (may differ!):

    # offsetVector_1 and offsetVector_2:
    #
    # gml:FeatureCollection
    #     gml:featureMember
    #         gml:FeatureCollection
    #             gml:featureMember
    #                 gml:RectifiedGridCoverage
    #                     gml:rectifiedGridDomain
    #                         gml:RectifiedGrid
    #                             gml:offsetVector[0]
    #                                 #text
    #                             gml:offsetVector[1]
    #                                 #text
    #
    # gml_pos:
    #
    # gml:FeatureCollection
    #     gml:featureMember
    #         gml:FeatureCollection
    #             gml:featureMember
    #                 gml:RectifiedGridCoverage
    #                     gml:rectifiedGridDomain
    #                         gml:RectifiedGrid
    #                             gml:origin
    #                                 gml:Point
    #                                     gml:pos

    # Find offsetVector elements in the file metadata
    gml_offsetVector_1, gml_offsetVector_2 = _offset_vector_texts(gml_json)

    # Check whether we have gml:pos or gml:coordinates element in the file metadata
    # gml:coordinates is a deprecated type according to opengis.net
    #
    # findgmlkey never raises; a missing element is reported as 'Unknown'
    gml_pos = findgmlkey(gml_json, 'gml:pos', 0)
    if gml_pos == "Unknown":
        gml_pos = findgmlkey(gml_json, 'gml:coordinates', 0)

    # Convert gml_pos to a list of number strings. Keep two entries
    # available even when the origin could not be read.
    gml_pos = _numeric_fields(gml_pos)
    if len(gml_pos) < 2:
        gml_pos = ['Unknown', 'Unknown']

    # Get semi-major axis of the Earth from ESPG metadata
    # TODO get this actually from metadata!
    earth_axis_semimajor = 6378137

    # Estimated meters for one degree on Earth surface for used ellipsoid model
    dec_mult = float((2 * math.pi * earth_axis_semimajor) / 360)

    # All numbers of both offset vectors, in document order
    l = _numeric_fields(gml_offsetVector_1) + _numeric_fields(gml_offsetVector_2)

    # Assumed length of list 'l' is either 4 (two values per vector)
    # or 6 (three values per vector). We must treat these differently.
    #
    # Map correct values into new array 'g'
    #
    if len(l) == 4:
        g = [l[0], l[1], l[2], l[3]]

    elif len(l) == 6:
        g = [
            l[3],
            l[4], # TODO is this correct index?
            l[2], # TODO is this correct index?
            l[1]
        ]

    elif "Unknown" in (gml_offsetVector_1, gml_offsetVector_2):
        # No offset vectors in the metadata: other output formats must still work
        g = ['Unknown'] * 4

    else:
        raise ValueError("Error: Incorrect worldfile metadata definition for rotational and pixel size values")

    # World file definition
    # https://en.wikipedia.org/wiki/World_file
    #
    # g[0]       = pixel size of X-axis in map units
    # g[1]       = Y-axis rotation
    # g[2]       = X-axis rotation
    # g[3]       = pixel size of Y-axis in map units
    # gml_pos[0] = X-coordinate of the center of the upper left pixel
    # gml_pos[1] = Y-coordinate of the center of the upper left pixel

    # TODO should gml_pos[1] value be decreased by -1?
gml_posinfo = GML_Pos_offsetVectors()

################################################################
#
# ESPG INFORMATION RETRIEVAL

def _gml_text(value):
    """Return the text of a value found in the metadata tree.

    Elements with attributes are stored as dicts whose text is under '#text';
    anything else is returned as a string.
    """
    if isinstance(value, dict):
        value = value.get('#text', 'Unknown')
    return str(value)

class ESPGRetrieval():
    """Download and read the XML definition of the reference system of the image.

    espg_number -- trailing number of the '@srsName' attribute of the image
                   metadata, or None if it does not end with a number
    espg_file   -- '<espg_number>.xml', or None if the number is unknown
    """

    # Take the digits at the very end of srsName; this must not abort the
    # whole program when the attribute is missing or has another layout
    _srs_name = findgmlkey(gml_json, '@srsName', 0)
    _srs_code = re.search(r'(\d+)\s*$', _srs_name) if isinstance(_srs_name, str) else None

    if _srs_code is not None:
        espg_number = int(_srs_code.group(1))
        espg_file   = str(espg_number) + '.xml'
    else:
        Warn("Warning: Not a valid ESPG number found")
        espg_number = None
        espg_file   = None

    @staticmethod
    def ESPG_retrieve():
        """Download the XML definition into the current directory unless it is
        already there. A failed download only gives a warning and leaves no file."""
        espg_file = ESPGRetrieval.espg_file

        if espg_file is None:
            Warn("Warning: Not a valid ESPG number found")
            return

        if os.path.isfile('./' + espg_file):
            return

        # ESPG XML data URL
        espg_url = 'http://epsg.io/' + espg_file
        urlreq   = URL.Request(
            espg_url,
            data = None,
            headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 #(KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
            }
        )

        # Download first and write afterwards: an empty file left behind by a
        # failed download would prevent every later attempt.
        # URL and HTTP errors are OSError subclasses, decoding errors are ValueError.
        try:
            with URL.urlopen(urlreq) as response:
                espg_xml = response.read().decode('utf-8')
        except (OSError, ValueError):
            Warn("Warning: Could not download ESPG metadata")
            return

        with open(espg_file, 'w', encoding='utf-8') as espg_of:
            espg_of.write(espg_xml)

    @staticmethod
    def ESPG_read():
        """Read the downloaded XML definition and return its relevant values as a dict.

        Values which are not present in the file are the string 'Unknown'.
        Raises ValueError if the ESPG number of the image is unknown.
        """
        espg_file = ESPGRetrieval.espg_file

        if espg_file is None:
            raise ValueError("Error: Not a valid ESPG number found")

        with open(espg_file, 'r', encoding='utf-8') as espg_rf:
            espg_metadata_list = espg_rf.read()

        # The key search needs the parsed tree, not the JSON text
        espgparser = GMLDataParser(espg_metadata_list)
        espg_json  = espgparser.jsontree()

        return {
            'gml_datum':          _gml_text(findgmlkey(espg_json, 'gml:datumName', 0)),
            'gml_ellipsoid':      _gml_text(findgmlkey(espg_json, 'gml:ellipsoidName', 0)),
            'gml_coordsys':       _gml_text(findgmlkey(espg_json, 'gml:srsName', 0)),

            'gml_axis_1_abbrev':  _gml_text(findgmlkey(espg_json, 'gml:axisAbbrev', 0)),
            'gml_axis_1_dir':     _gml_text(findgmlkey(espg_json, 'gml:axisDirection', 0)).capitalize(),

            'gml_axis_2_abbrev':  _gml_text(findgmlkey(espg_json, 'gml:axisAbbrev', 1)),
            'gml_axis_2_dir':     _gml_text(findgmlkey(espg_json, 'gml:axisDirection', 1)).capitalize(),

            # These have child element #text which contains the actual value
            'gml_semimajor_axis': _gml_text(findgmlkey(espg_json, 'gml:semiMajorAxis', 0)),
            'gml_inverse_flat':   _gml_text(findgmlkey(espg_json, 'gml:inverseFlattening', 0))
        }

#espg_data = ESPGRetrieval()
#espg_data.ESPG_read()
#sys.exit()

################################################################
#
# PHYSICAL AREA SIZE CALCULATOR

def axisCalculator():
    """Return the size of the image area as a list of five strings.

    The entries are, in this order: X length, Y length, area divided by
    1000000, distance between the corners and the angle between the corners
    in gradians. Numbers are formatted with two decimals. All five entries
    are 'Unknown' when no usable corner values exist in the metadata.
    """

    def corner_values(high_key, low_key):
        # Both elements hold 'x y' pairs; return (x_high, x_low, y_high, y_low)
        high = findgmlkey(gml_json, high_key, 0).split()
        low  = findgmlkey(gml_json, low_key, 0).split()
        return float(high[0]), float(low[0]), float(high[1]), float(low[1])

    def RadtoGrad(num):
        rad_to_deg  = 180 * num / math.pi
        deg_to_grad = 10 * rad_to_deg / 9
        return deg_to_grad

    # Axis-based data is preferred, pixel-based data is the fallback.
    # A missing element ('Unknown'), a single value or a non-string value
    # all mean that the pair can't be used.
    for high_key, low_key in (('gml:upperCorner', 'gml:lowerCorner'),
                              ('gml:high',        'gml:low')):
        try:
            x_high, x_low, y_high, y_low = corner_values(high_key, low_key)
            break
        except (ValueError, IndexError, AttributeError, TypeError):
            continue
    else:
        return ['Unknown'] * 5

###############################
# X and Y lengths
# Area size in km^2
# NOTE(review): this presumes that the corner values are in meters - confirm

    x_length = x_high - x_low
    y_length = y_high - y_low
    xy_area  = (x_length * y_length) / 1000000

###############################
# Inverse geodetic calculation
# NOTE(review): the angle is measured from the X axis - confirm that this
# is what the caller's azimuth label means

    xy_hypotenuse      = math.hypot(x_length, y_length)
    inverse_geod_angle = RadtoGrad(math.atan2(y_length, x_length))

###############################

    return [
        format(x_length, '.2f'),
        format(y_length, '.2f'),
        format(xy_area, '.2f'),
        format(xy_hypotenuse, '.2f'),
        format(inverse_geod_angle, '.2f')
    ]

gml_calc = axisCalculator()
+################################################################ +# +# TFW FORMAT PARSE + +def tfwparse(): + + worldfile_values = gml_posinfo.g + gml_posinfo.gml_pos + + worldfile_out = '' + for value in worldfile_values: + worldfile_out += format(float(value)) + '\n' + + # Return gml_out, remove last empty line + return worldfile_out[:-1] + +################################################################ +# +# INFORMATION PARSE + +# Extract all important metadata elements + +def infoparse(): + + def getkeys(): + + # TODO these might or might not be defined in JSON data! + infolist = [ + ['Image Name', args.inputfile.split('.')[0] ], + ['Source Name', findgmlkey(gml_json, '@srsName', 0) ], + ['GML File Name', findgmlkey(gml_json, 'gml:fileName', 0) ], + ['File Structure', findgmlkey(gml_json, 'gml:fileStructure', 0) ], + ['Rectified Grid Coverage ID', findgmlkey(gml_json, '@dimension', 0) ], + #['Axis Names', ' '.join(findgmlkey('gml:axisName', 0)) ], + ['Map Scale', ], + ['Upper Corner Coordinates', findgmlkey(gml_json, 'gml:upperCorner', 0) ], + ['Lower Corner Coordinates', findgmlkey(gml_json, 'gml:lowerCorner', 0) ], + ['X-axis Length in Meters', gml_calc[0] ], + ['Y-axis Length in Meters', gml_calc[1] ], + ['Area Size in Square Kilometers', gml_calc[2] ], + ['Distance of Corners Points in Meters', gml_calc[3] ], + ['Azimuth Angle of Corner Points in Gradians', gml_calc[4] ], + ['Grid Envelope High', findgmlkey(gml_json, 'gml:high', 0) ], + ['Grid Envelope Low', findgmlkey(gml_json, 'gml:low', 0) ], + ['X-axis Pixel Size in Map Units', gml_posinfo.g[0] ], + ['Y-axis pixel size in Map Units', gml_posinfo.g[3] ], + ['X-axis Rotation', gml_posinfo.g[1] ], + ['Y-axis Rotation', gml_posinfo.g[2] ], + ['Upper Left Pixel X-coordinate Center in Map Units', gml_posinfo.gml_pos[0] ], + ['Upper Left Pixel Y-coordinate Center in Map Units', gml_posinfo.gml_pos[1] ] + #['EPSG Projection Code', + #['Projection Name', + #['Projection Area', + #['Image Area', + ] + + 
#row_format ="{:>15}" * (len(teams_list) + 1) + #print(row_format.format("", *teams_list)) + #for team, row in zip(teams_list, data): + # print row_format.format(team, *row) + + #for i in infolist: + # print(i) + + #sys.exit() + + for i in range(len(infolist)): + for j in range(len(infolist[i])): + print(infolist[i][j], end=' ') + print('') + + getkeys() + + #print(gml_source) + +################################ +# +# OUTPUT WRITING + +try: + args.outputfile + + with open(args.outputfile, 'w') as o: + + if args.outputformat in 'xml': + if args.formatting in 'pretty': + o.write(gmlparser.xmlpretty()) + elif args.formatting in 'raw': + o.write(gmlparser.jsonraw()) + else: + raise ValueError("Error: Undefined formatting") + + elif args.outputformat in 'json': + if args.formatting in 'pretty': + o.write(gmlparser.jsonpretty()) + elif args.formatting in 'raw': + o.write(gmlparser.jsonraw()) + else: + raise ValueError("Error: invalid data formatting") + + elif args.outputformat in 'tfw' or args.outputformat in 'worldfile': + o.write(tfwparse()) + + elif args.outputformat in 'info': + o.write(infoparse()) + else: + raise ValueError("Error: invalid data format") + +except: + + if args.outputformat in 'xml': + if args.formatting in 'pretty': + print(gmlparser.xmlpretty()) + elif args.formatting in 'raw': + print(gmlparser.xmlraw()) + else: + raise ValueError("Error: Undefined formatting") + + elif args.outputformat in 'json': + if args.formatting in 'pretty': + print(gmlparser.jsonpretty()) + elif args.formatting in 'raw': + print(gmlparser.jsonraw()) + else: + raise ValueError("Error: Undefined formatting") + + elif args.outputformat in 'tfw' or args.outputformat in 'worldfile': + print(tfwparse()) + + elif args.outputformat in 'info': + print(infoparse()) + else: + raise ValueError("Error: invalid data format") diff --git a/images/sample_json.png b/images/sample_json.png new file mode 100644 index 0000000..3141e9e Binary files /dev/null and b/images/sample_json.png differ 
diff --git a/images/sample_tfw_info.png b/images/sample_tfw_info.png new file mode 100644 index 0000000..79cfd82 Binary files /dev/null and b/images/sample_tfw_info.png differ diff --git a/images/sample_xml.png b/images/sample_xml.png new file mode 100644 index 0000000..e4bccd9 Binary files /dev/null and b/images/sample_xml.png differ