#!/usr/bin/env python3

#    Simple JPEG2000 GML data parser
#    Copyright (C) 2019  Pekka Helenius
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <https://www.gnu.org/licenses/>.

################################################################

import sys
from warnings import warn as Warn
import os.path
import argparse
import re
import xmltodict
import json
import urllib.request as URL
import math

# TODO import csv

# TODO retrieve place name by using metadata coordinates & ref.system info
#      requires internet connection and connection to a valid server
#
# TODO rename JPEG2000 file based on metadata entries, syntax given by user
#
# TODO fix tfw export for JPEG2000 files

################################################################
#
# INPUT ARGUMENTS

argparser = argparse.ArgumentParser()

argparser.add_argument(
    '-i', '--input',
    help='Input JPEG2000 image file',
    nargs='?',
    dest='inputfile',
)
# The output format has no default value: the script refuses to run without
# one, so the help text must not advertise a default.
argparser.add_argument(
    '-f', '--dataformat',
    help='Output format (Required; Available: xml | json | [tfw|worldfile] | info)',
    nargs='?',
    dest='outputformat',
)
argparser.add_argument(
    '-o', '--output',
    help='Output file name',
    nargs='?',
    dest='outputfile',
)
argparser.add_argument(
    '-l', '--formatting',
    help='Data formatting (Default: pretty; Available: raw, pretty)',
    nargs='?',
    dest='formatting',
)

args = argparser.parse_args()

# Formatting defaults to pretty format. The check is done after parsing
# (instead of using default=) because '-l' given without a value also
# yields None with nargs='?'.
if args.formatting is None:
    args.formatting = 'pretty'
################################################################

# Show the usage text when the script is started without any arguments
if not len(sys.argv) > 1:
    argparser.print_help()
    sys.exit(0)

# Without this check a missing '-i' ends in an AttributeError on None
if args.inputfile is None:
    raise ValueError("Error: No input file specified")

if not args.inputfile.endswith('.jp2'):
    Warn("Warning: Not a valid JPEG2000 file suffix")

if args.outputformat is None:
    raise ValueError("Error: No output format or file specified")
elif args.outputformat not in ('json', 'xml', 'tfw', 'worldfile', 'info'):
    raise ValueError("Error: Not a valid output format")

################################################################
#
# JPEG2000 CHECK STRINGS

# Look for these jp2 strings in file header
#
jp2_header_str = ['ftypjp2', 'jp2 jpx', 'jp2', 'jp2h', 'jp2 J2P1']

# Byte strings marking the start of the metadata section of the jp2 file.
# The file is read in binary mode, so the markers must be bytes as well.
#
mdata_start_str = [
    str.encode('gml.data'),
    str.encode('gxml'),
    str.encode('fxml'),
]

# Byte string marking the end of the metadata section
mdata_end_str = str.encode('uuid')


def _metadata_start_offset(line):
    """Return the offset of the earliest metadata start marker in *line*.

    Returns -1 when the line contains none of the 'mdata_start_str' markers.
    """
    offsets = [line.find(marker) for marker in mdata_start_str]
    offsets = [offset for offset in offsets if offset >= 0]
    return min(offsets) if offsets else -1


def _decode_chunks(chunks, separator=''):
    """Decode byte chunks as strict UTF-8 and join the ones that succeed.

    The chunks come from a binary image file, so many of them are not valid
    text. Those are skipped on purpose. *separator* is appended to every
    decoded chunk.
    """
    decoded = []
    for chunk in chunks:
        try:
            decoded.append(chunk.decode('utf-8', errors='strict') + separator)
        except UnicodeDecodeError:
            continue
    return ''.join(decoded)


################################################################
#
# OPEN JPEG2000 file

# Open the image file in read-only binary mode
with open(args.inputfile, 'rb') as f:

    ############################################################
    #
    # JPEG2000 header check

    # Read only the first three lines instead of loading the whole image
    # into memory just to look at its header.
    header_lines = [f.readline() for _ in range(3)]

    # Bytes -> string conversion, ignoring everything that is not text
    header_str = ''.join(
        header_line.decode('utf-8', errors='ignore')
        for header_line in header_lines
    )

    if not any(jp2_str in header_str for jp2_str in jp2_header_str):
        raise ValueError("Error: Not a valid JPEG2000 file")

    ############################################################
    #
    # PARSE METADATA LINES

    # Single pass over the file:
    #   1. find the first line containing a metadata start marker
    #   2. collect the following lines until the end marker shows up
    #
    # The end marker is only accepted after the start marker, so an
    # unrelated 'uuid' earlier in the file cannot produce an empty range.
    # If there is no end marker at all, everything up to the end of the
    # file is kept as a best effort.
    #
    f.seek(0)

    mstart_num = None
    mstart_line = None
    mend_num = None
    metadata_tail = []

    for line_num, line in enumerate(f):
        if mstart_line is None:
            start_offset = _metadata_start_offset(line)
            if start_offset < 0:
                continue
            mstart_num, mstart_line = line_num, line
            if mdata_end_str in line[start_offset:]:
                mend_num = line_num
                break
        elif mdata_end_str in line:
            mend_num = line_num
            break
        else:
            metadata_tail.append(line)

if mstart_line is None:
    raise ValueError("Error: Could not find GML metadata in the file")

# The start line usually mixes binary data with the beginning of the XML.
# It is therefore split into whitespace separated tokens which are decoded
# one by one. The start line itself is not added a second time as a whole.
metadata_lines = mstart_line.split() + metadata_tail

# Tokens of the start line get a space appended because split() removed the
# whitespace between them. The remaining lines still carry their newline.
metadata_str = (
    _decode_chunks(mstart_line.split(), separator=' ')
    + _decode_chunks(metadata_tail)
)

# Drop everything before the first '<' and a single trailing non-'>' symbol
metadata_xml_all = re.sub(r'(^[^<]*)|([^>]$)', '', metadata_str)

# Create a list element from extracted metadata strings
metadata_xml_all_list = metadata_xml_all.split()

# Find the last closing tag (</name>) of the metadata and store its name
closing_tag_re = re.compile(r'</([^<>\s]+)>')

for token in reversed(metadata_xml_all_list):
    closing_names = closing_tag_re.findall(token)
    if closing_names:
        last_tag = closing_names[-1]
        break
else:
    raise ValueError("Error: Could not find GML metadata in the file")

# In the original metadata list, find the first occurrence of the 'last_tag'
# as an opening tag. The tag name is escaped because it is file content, and
# the lookahead prevents matching a longer tag name sharing the same prefix.
opening_tag_re = re.compile('<' + re.escape(last_tag) + r'(?![^\s/>])')

for firstxml_index, value in enumerate(metadata_xml_all_list):
    if opening_tag_re.match(value):
        break
else:
    raise ValueError("Error: Could not find GML metadata in the file")

# For joined metadata list, delete all list entries presented before our
# 'last_tag'. Convert list to string format.
metadata_parsed_list = metadata_xml_all_list[firstxml_index:]
metadata_joined_list = ' '.join(metadata_parsed_list)

################################################################
#


class GMLDataParser(object):
    """Convert an XML document string into XML, JSON or dictionary form."""

    def __init__(self, datalist):
        # XML document as a single string
        self.datalist = datalist

    def xmlraw(self):
        """Return the document as parsed by xmltodict (a mapping, not text)."""
        return xmltodict.parse(self.datalist)

    def xmlpretty(self):
        """Return the document as indented XML text."""
        return xmltodict.unparse(
            xmltodict.parse(self.datalist),
            pretty=True,
            indent=" ",
            newl="\n",
        )

    def jsonraw(self):
        """Return the document as compact JSON text."""
        return json.dumps(xmltodict.parse(self.datalist), separators=(',', ':'))

    def jsonpretty(self):
        """Return the document as indented JSON text with sorted keys."""
        return json.dumps(xmltodict.parse(self.datalist), indent=2, sort_keys=True)

    def jsontree(self):
        """Return the document as a tree of plain dicts and lists."""
        return json.loads(self.jsonpretty())

    # Function to get nested key values from JSON data
    # by arainchi
    # https://stackoverflow.com/a/19871956
    def findkey(self, tree, keyvalue):
        """Yield every value stored under *keyvalue* anywhere in *tree*.

        Lists and dicts are searched recursively. Any other type yields
        nothing.
        """
        if isinstance(tree, list):
            for item in tree:
                yield from self.findkey(item, keyvalue)
        elif isinstance(tree, dict):
            if keyvalue in tree:
                yield tree[keyvalue]
            for child in tree.values():
                yield from self.findkey(child, keyvalue)


gmlparser = GMLDataParser(metadata_joined_list)
gml_json = gmlparser.jsontree()


def findgmlkey(data, gmlkey, num):
    """Return the *num*-th value found for *gmlkey* in *data*.

    Returns the string 'Unknown' when there is no such match, so callers
    never get an exception from a missing metadata element.
    """
    try:
        return list(gmlparser.findkey(data, gmlkey))[num]
    except (IndexError, TypeError):
        return str("Unknown")


################################################################
#
# Extract relevant values for TFW file/Worldfile


class GML_Pos_offsetVectors():
    """World file values read from the GML metadata.

    All work happens while the class body is executed. The results are the
    class attributes 'g' (pixel sizes and rotations) and 'gml_pos'
    (coordinates of the upper left pixel).
    """

    # Sample metadata structure of JPEG2000 files (may differ!):

    # offsetVector_1 and offsetVector_2:
    #
    # gml:FeatureCollection
    #   gml:featureMember
    #     gml:FeatureCollection
    #       gml:featureMember
    #         gml:RectifiedGridCoverage
    #           gml:rectifiedGridDomain
    #             gml:RectifiedGrid
    #               gml:offsetVector[0]
    #                 #text
    #               gml:offsetVector[1]
    #                 #text
    #
    # gml_pos:
    #
    # gml:FeatureCollection
    #   gml:featureMember
    #     gml:FeatureCollection
    #       gml:featureMember
    #         gml:RectifiedGridCoverage
    #           gml:rectifiedGridDomain
    #             gml:RectifiedGrid
    #               gml:origin
    #                 gml:Point
    #                   gml:pos

    # Find offsetVector elements in the file metadata
    # These elements include field #text which we are searching for
    gml_offsetVector_1 = findgmlkey(gml_json, '#text', 0)
    gml_offsetVector_2 = findgmlkey(gml_json, '#text', 1)

    # Check whether we have gml:pos or gml:coordinates element in the file
    # metadata. gml:coordinates is a deprecated type according to opengis.net
    #
    # findgmlkey never raises. A missing element is reported as 'Unknown',
    # so the fallback has to test for that value.
    gml_pos = findgmlkey(gml_json, 'gml:pos', 0)
    if gml_pos == "Unknown":
        gml_pos = findgmlkey(gml_json, 'gml:coordinates', 0)

    # Convert gml_pos to list type in a case it is string type
    if isinstance(gml_pos, str):
        # Split values, use any other symbol as a separator except for dot,
        # minus prefix and numbers.
        gml_pos = re.split(r'[^\-^\d^\.]+', gml_pos)

    # Get semi-major axis of the Earth from EPSG metadata
    # TODO get this actually from metadata! Until then a fixed value is used.
    earth_axis_semimajor = 6378137

    # Estimated meters for one degree on Earth surface for used ellipsoid model
    dec_mult = float((2 * math.pi * earth_axis_semimajor) / 360)

    # Collect the components of both offset vectors into list 'l'
    l = []

    for d in (gml_offsetVector_1, gml_offsetVector_2):
        if isinstance(d, str):
            d = re.split(r'[^\-^\d^\.]+', d)
        l += d

    # Assumed length of list 'l' is either 4 (two components per vector) or
    # 6 (three components per vector). We must treat these differently.
    # If the length is neither of those, raise an error.
    #
    # Map correct values into new array 'g'
    #
    g = [0] * 4

    if len(l) == 4:
        g[0] = l[0]
        g[1] = l[1]
        g[2] = l[2]
        g[3] = l[3]

    elif len(l) == 6:
        g[0] = l[3]
        g[1] = l[4]  # TODO is this correct index?
        g[2] = l[2]  # TODO is this correct index?
        g[3] = l[1]

    else:
        raise ValueError("Error: Incorrect worldfile metadata definition for rotational and pixel size values")

    # World file definition
    # https://en.wikipedia.org/wiki/World_file
    #
    # g[0] = pixel size of X-axis in map units
    # g[1] = Y-axis rotation
    # g[2] = X-axis rotation
    # g[3] = pixel size of Y-axis in map units
    # gml_pos[0] = X-coordinate of the center of the upper left pixel
    # gml_pos[1] = Y-coordinate of the center of the upper left pixel

    # TODO should gml_pos[1] value be decreased by -1?
gml_posinfo = GML_Pos_offsetVectors()

################################################################
#
# EPSG INFORMATION RETRIEVAL


class ESPGRetrieval():
    """Download and read the EPSG definition referenced by the image.

    'espg_number' and 'espg_file' are None when the image metadata does not
    carry a numeric srsName code. Nothing in the script calls the methods
    yet.
    """

    # The code is the trailing number of the srsName value. A missing or
    # non-numeric srsName only gives a warning, so that output formats not
    # needing EPSG data keep working.
    _srs_name = findgmlkey(gml_json, '@srsName', 0)
    _srs_code = re.search(r'(\d+)\s*$', str(_srs_name))

    if _srs_code:
        espg_number = int(_srs_code.group(1))
        espg_file = str(espg_number) + '.xml'
    else:
        Warn("Warning: Not a valid ESPG number found")
        espg_number = None
        espg_file = None

    @staticmethod
    def ESPG_retrieve():
        """Download the EPSG XML file unless it already exists locally.

        A failed download gives a warning and leaves no file behind.
        """
        # Class attributes are not visible as bare names inside methods
        espg_file = ESPGRetrieval.espg_file

        if espg_file is None or os.path.isfile('./' + espg_file):
            return

        # EPSG XML data URL
        espg_url = 'http://epsg.io/' + espg_file

        urlreq = URL.Request(
            espg_url,
            data=None,
            headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 '
                              '#(KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
            }
        )

        # Download first and write afterwards. Opening the file before the
        # download would leave an empty file which is then taken as a valid
        # local copy on the next run.
        try:
            with URL.urlopen(urlreq) as response:
                espg_xml = response.read().decode('utf-8')
        except Exception:
            # Best effort: any network or decoding problem is only a warning
            Warn("Warning: Could not download ESPG metadata")
            return

        with open(espg_file, 'w') as espg_of:
            espg_of.write(espg_xml)

    @staticmethod
    def ESPG_read():
        """Read the local EPSG XML file and return selected values as a dict.

        Values which can not be found are given as 'Unknown'.

        Raises:
            ValueError: the image metadata has no usable EPSG code.
        """
        if ESPGRetrieval.espg_file is None:
            raise ValueError("Error: Not a valid ESPG number found")

        with open(ESPGRetrieval.espg_file, 'r') as espg_rf:
            espg_metadata_list = espg_rf.read()

        espgparser = GMLDataParser(espg_metadata_list)

        # The key search works on dicts and lists, not on JSON text
        espg_json = espgparser.jsontree()

        return {
            'gml_datum': findgmlkey(espg_json, 'gml:datumName', 0),
            'gml_ellipsoid': findgmlkey(espg_json, 'gml:ellipsoidName', 0),
            'gml_coordsys': findgmlkey(espg_json, 'gml:srsName', 0),
            'gml_axis_1_abbrev': findgmlkey(espg_json, 'gml:axisAbbrev', 0),
            'gml_axis_1_dir': str(findgmlkey(espg_json, 'gml:axisDirection', 0)).capitalize(),
            'gml_axis_2_abbrev': findgmlkey(espg_json, 'gml:axisAbbrev', 1),
            'gml_axis_2_dir': str(findgmlkey(espg_json, 'gml:axisDirection', 1)).capitalize(),
            # TODO. Have child element #text which contains the actual value
            'gml_semimajor_axis': findgmlkey(espg_json, 'gml:semiMajorAxis', 0),
            'gml_inverse_flat': findgmlkey(espg_json, 'gml:inverseFlattening', 0),
        }


# TODO EPSG retrieval is not used by any output format yet

################################################################
#
# PHYSICAL AREA SIZE CALCULATOR


def axisCalculator():
    """Calculate physical dimensions of the image area.

    Returns a list of five strings: X length, Y length, area divided by
    1000000, corner distance and corner angle in gradians, all with two
    decimals. Every item is 'Unknown' when the metadata has no usable
    corner values.
    """

    def corner_values(high_key, low_key):
        # Raises for missing, non-string or non-numeric elements
        high = findgmlkey(gml_json, high_key, 0).split()
        low = findgmlkey(gml_json, low_key, 0).split()
        return float(high[0]), float(low[0]), float(high[1]), float(low[1])

    def RadtoGrad(num):
        rad_to_deg = 180 * num / math.pi
        deg_to_grad = 10 * rad_to_deg / 9
        return deg_to_grad

    # Axis-based data is preferred, pixel-based data is the fallback
    corner_keys = (
        ('gml:upperCorner', 'gml:lowerCorner'),
        ('gml:high', 'gml:low'),
    )

    for high_key, low_key in corner_keys:
        try:
            x_high, x_low, y_high, y_low = corner_values(high_key, low_key)
            break
        except (AttributeError, IndexError, TypeError, ValueError):
            continue
    else:
        return ['Unknown'] * 5

    ###############################
    # X and Y lengths
    # Area size in km^2

    x_length = x_high - x_low
    y_length = y_high - y_low
    xy_area = (x_length * y_length) / 1000000

    ###############################
    # Inverse geodetic calculation

    xy_hypotenuse = math.sqrt(x_length ** 2 + y_length ** 2)
    inverse_geod_angle = RadtoGrad(math.atan2(y_length, x_length))

    ###############################

    return [
        format(x_length, '.2f'),
        format(y_length, '.2f'),
        format(xy_area, '.2f'),
        format(xy_hypotenuse, '.2f'),
        format(inverse_geod_angle, '.2f'),
    ]


gml_calc = axisCalculator()

################################################################
#
# TFW FORMAT PARSE


def tfwparse():
    """Return the world file content, one value per line.

    Raises:
        ValueError: some of the values is not a number.
    """
    worldfile_values = gml_posinfo.g + gml_posinfo.gml_pos

    try:
        return '\n'.join(format(float(value)) for value in worldfile_values)
    except (TypeError, ValueError) as err:
        raise ValueError("Error: Worldfile metadata contains non-numeric values") from err


################################################################
#
# INFORMATION PARSE

# Extract all important metadata elements


def infoparse():
    """Return a text summary of the metadata, one 'label value' row per line."""

    # The name is taken from the file name only, so directory parts and
    # dots in the path do not end up in (or cut) the image name.
    image_name = os.path.splitext(os.path.basename(args.inputfile))[0]

    # TODO these might or might not be defined in JSON data!
    #
    # World file order of gml_posinfo.g is: X pixel size, Y-axis rotation,
    # X-axis rotation, Y pixel size.
    infolist = [
        ['Image Name', image_name],
        ['Source Name', findgmlkey(gml_json, '@srsName', 0)],
        ['GML File Name', findgmlkey(gml_json, 'gml:fileName', 0)],
        ['File Structure', findgmlkey(gml_json, 'gml:fileStructure', 0)],
        ['Rectified Grid Coverage ID', findgmlkey(gml_json, '@dimension', 0)],
        ['Map Scale', ],
        ['Upper Corner Coordinates', findgmlkey(gml_json, 'gml:upperCorner', 0)],
        ['Lower Corner Coordinates', findgmlkey(gml_json, 'gml:lowerCorner', 0)],
        ['X-axis Length in Meters', gml_calc[0]],
        ['Y-axis Length in Meters', gml_calc[1]],
        ['Area Size in Square Kilometers', gml_calc[2]],
        ['Distance of Corners Points in Meters', gml_calc[3]],
        ['Azimuth Angle of Corner Points in Gradians', gml_calc[4]],
        ['Grid Envelope High', findgmlkey(gml_json, 'gml:high', 0)],
        ['Grid Envelope Low', findgmlkey(gml_json, 'gml:low', 0)],
        ['X-axis Pixel Size in Map Units', gml_posinfo.g[0]],
        ['Y-axis pixel size in Map Units', gml_posinfo.g[3]],
        ['X-axis Rotation', gml_posinfo.g[2]],
        ['Y-axis Rotation', gml_posinfo.g[1]],
        ['Upper Left Pixel X-coordinate Center in Map Units', gml_posinfo.gml_pos[0]],
        ['Upper Left Pixel Y-coordinate Center in Map Units', gml_posinfo.gml_pos[1]],
        # TODO EPSG Projection Code, Projection Name, Projection Area,
        #      Image Area, Axis Names
    ]

    # Every item is followed by a single space
    return '\n'.join(
        ''.join(str(item) + ' ' for item in row)
        for row in infolist
    )


################################
#
# OUTPUT WRITING


def _render_output():
    """Return the text for the requested output format and formatting.

    Raises:
        ValueError: unknown output format or formatting.
    """
    if args.outputformat in ('xml', 'json'):
        renderers = {
            ('xml', 'pretty'): gmlparser.xmlpretty,
            # xmlraw gives a mapping, str() is what print shows for it
            ('xml', 'raw'): lambda: str(gmlparser.xmlraw()),
            ('json', 'pretty'): gmlparser.jsonpretty,
            ('json', 'raw'): gmlparser.jsonraw,
        }
        try:
            renderer = renderers[(args.outputformat, args.formatting)]
        except KeyError:
            raise ValueError("Error: Undefined formatting") from None
        return renderer()

    if args.outputformat in ('tfw', 'worldfile'):
        return tfwparse()

    if args.outputformat == 'info':
        return infoparse()

    raise ValueError("Error: invalid data format")


# The text is created before the output file is opened, so an error in the
# data never leaves an empty output file behind.
output_text = _render_output()

if args.outputfile:
    try:
        with open(args.outputfile, 'w') as o:
            o.write(output_text)
    except OSError:
        # Keep the data available even if the file can not be written
        Warn("Warning: Could not write the output file, using standard output")
        print(output_text)
else:
    print(output_text)