Simple JPEG2000 GML data parser / extractor
#!/usr/bin/env python3
# Simple JPEG2000 GML data parser
# Copyright (C) 2019 Pekka Helenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
################################################################
import sys
from warnings import warn as Warn
import os.path
import argparse
import re
import xmltodict
import json
import urllib.request as URL
import math
# TODO import csv
# TODO retrieve place name by using metadata coordinates & ref.system info
# requires internet connection and connection to a valid server
#
# TODO rename JPEG2000 file based on metadata entries, syntax given by user
#
# TODO fix tfw export for JPEG2000 files
################################################################
#
# INPUT ARGUMENTS
argparser = argparse.ArgumentParser()
argparser.add_argument('-i', '--input', help = 'Input JPEG2000 image file', nargs = '?', dest = 'inputfile')
argparser.add_argument('-f', '--dataformat', help = 'Output data format (Available: xml | json | tfw | worldfile | info)', nargs = '?', dest = 'outputformat')
argparser.add_argument('-o', '--output', help = 'Output file name', nargs = '?', dest = 'outputfile')
argparser.add_argument('-l', '--formatting', help = 'Data formatting (Default: pretty; Available: raw, pretty)', nargs = '?', dest = 'formatting')
args = argparser.parse_args()
# Formatting defaults to pretty format
#
if args.formatting is None:
    args.formatting = 'pretty'
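# Example invocation (file names are illustrative):
#
#   python3 <this_script>.py -i orthophoto.jp2 -f info
#   python3 <this_script>.py -i orthophoto.jp2 -f json -l raw -o metadata.json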
################################################################
if not len(sys.argv) > 1:
    argparser.print_help()
    sys.exit(0)

if args.inputfile is None:
    raise ValueError("Error: No input file specified")

if not args.inputfile.endswith('.jp2'):
    Warn("Warning: Not a valid JPEG2000 file suffix")

if args.outputformat is None:
    raise ValueError("Error: No output format specified")
elif args.outputformat not in ('json', 'xml', 'tfw', 'worldfile', 'info'):
    raise ValueError("Error: Not a valid output format")
################################################################
#
# JPEG2000 CHECK STRINGS
# Look for these jp2 strings in file header
#
jp2_header_str = ['ftypjp2', 'jp2 jpx', 'jp2', 'jp2h', 'jp2 J2P1']
# First strings to look for in the selected jp2 file.
# Basically, we are looking for the start of the file footer,
# which is indicated by one of these strings.
#
# We need to convert the strings into bytes with the encode method
# for the comparison loops below.
#
mdata_start_str = [ str.encode('gml.data'), str.encode('gxml'), str.encode('fxml') ]
mdata_end_str = str.encode('uuid')
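# Note: in GMLJP2-style files the GML payload is stored in an association box
# labelled 'gml.data', hence the first marker above; a subsequent 'uuid' box is
# used here only as a rough end-of-metadata marker. This is a heuristic search,
# not a full JP2 box parser.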
################################################################
#
# OPEN JPEG2000 file
# Open the image file in read-only binary mode
with open(args.inputfile, 'rb') as f:
    ################################################################
    #
    # JPEG2000 header check
    # Check the first four lines of the file
    header_lines = f.readlines()[0:4]

    # Declare a variable to store the header text string
    header_str = ''

    # For each header line 1-4...
    for a in header_lines:
        # Decode the binary formatted string (bytes -> string conversion) and ignore any errors we encounter
        header_decode = a.decode('utf-8', errors='ignore')
        # Store the decoded string in the header_str variable
        header_str += header_decode

    # Check the existence of each jp2 specific string we have defined in the 'jp2_header_str' list above
    for jp2_str in jp2_header_str:
        # If a jp2 specific string is found, set up a new variable t and break the for loop
        if jp2_str in header_str:
            t = ''
            break

    # Variable t is not defined if no valid jp2 string was found. Thus, this test gives
    # us an exception (NameError) if no jp2 string is found.
    try:
        t
    except NameError:
        raise ValueError("Error: Not a valid JPEG2000 file")
    ################################################################
    #
    # PARSE METADATA LINES
    # Enumerate all lines and look for the metadata start line, using
    # the strings in 'mdata_start_str' as a reference.
    # Break the loop when found. If not found, abort.
    #
    # Return to the first line again in order to parse the footer lines
    f.seek(0)

    for mstart_num, mstart_line in enumerate(f):
        if any(m_str in mstart_line for m_str in mdata_start_str):
            break
        # else
        # TODO print an error: cannot find metadata start. Abort
        # TODO should return value 2

    #print(mstart_num)
    #sys.exit()

    # Enumerate all lines and look for the metadata end line, using
    # the string 'mdata_end_str' as a reference.
    # Break the loop when found. If not found, abort.
    #
    f.seek(0)

    for mend_num, mend_line in enumerate(f):
        if mdata_end_str in mend_line:
            break
        # else
        # TODO print an error: cannot find metadata end. Abort

    # Reset readlines
    #
    # Convert the metadata start line from 'bytes' type to 'list' with the split method
    # and merge it with the rest of the metadata lines, given by the readlines method.
    # The type of this line list is 'list', thus we use the + operator to combine these
    # lists.
    #
    f.seek(0)
    metadata_lines = mstart_line.split() + f.readlines()[mstart_num:mend_num]
    #mdata_lines = mstart_line.split() + f.readlines()
    # Create a new metadata_str variable where we will store our extracted footer strings.
    metadata_str = ''

    for byteline in metadata_lines:
        # Try to decode each metadata line to UTF-8 format.
        # As these lines are binary code, the conversion will fail for some
        # of them. In case of a failure, we let the for loop pass to the next
        # line.
        #
        # Add each successfully decoded line to the 'metadata_str' variable
        #
        try:
            byteline_decoded = byteline.decode('utf-8', errors='strict')
            metadata_str += byteline_decoded
        except UnicodeDecodeError:
            pass
# Strip everything before the first '<' character, and a possible trailing
# non-'>' character, from the extracted string
metadata_xml_all = re.sub(r'(^[^<]*)|([^>]$)', '', metadata_str)

# Create a list from the extracted metadata strings
metadata_xml_all_list = metadata_xml_all.split()

# Find the last element containing <> symbols in metadata_xml_all_list,
# get the string between them and store it in a new variable
for i in reversed(metadata_xml_all_list):
    if re.match('</*.*>', i):
        last_tag = re.sub('</|>', '', i)
        break

# In the original metadata list, find the first occurrence of the 'last_tag'
for firstxml_index, value in enumerate(metadata_xml_all_list):
    if re.match('<' + last_tag + '>?', value):
        break

# From the metadata list, drop all entries that appear before our 'last_tag',
# then convert the list back to string format
metadata_parsed_list = metadata_xml_all_list[firstxml_index:]
metadata_joined_list = ' '.join(metadata_parsed_list)
################################################################
#
class GMLDataParser(object):

    def __init__(self, datalist):
        self.datalist = datalist

    # Return the GML metadata as a raw (unformatted) XML string
    def xmlraw(self):
        return xmltodict.unparse(xmltodict.parse(self.datalist))

    # Return the GML metadata as an indented, human-readable XML string
    def xmlpretty(self):
        return xmltodict.unparse(xmltodict.parse(self.datalist),
                                 pretty=True, indent=" ", newl="\n")

    # Return the GML metadata as a compact JSON string
    def jsonraw(self):
        return json.dumps(xmltodict.parse(self.datalist),
                          separators=(',', ':'))

    # Return the GML metadata as an indented JSON string
    def jsonpretty(self):
        return json.dumps(xmltodict.parse(self.datalist),
                          indent=2, sort_keys=True)

    # Convert GML metadata to a JSON tree object
    def jsontree(self):
        return json.loads(self.jsonpretty())
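    # The resulting JSON tree typically has a shape like the following
    # (the actual element names depend entirely on the input file):
    #
    #   {
    #     "gml:FeatureCollection": {
    #       "gml:featureMember": { ... }
    #     }
    #   }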
    # Function to get nested key values from JSON data
    # by arainchi
    # https://stackoverflow.com/a/19871956
    def findkey(self, tree, keyvalue):
        if isinstance(tree, list):
            for i in tree:
                for x in self.findkey(i, keyvalue):
                    yield x
        elif isinstance(tree, dict):
            if keyvalue in tree:
                yield tree[keyvalue]
            for j in tree.values():
                for x in self.findkey(j, keyvalue):
                    yield x
gmlparser = GMLDataParser(metadata_joined_list)
gml_json = gmlparser.jsontree()
def findgmlkey(data, gmlkey, num):
    try:
        return list(gmlparser.findkey(data, gmlkey))[num]
    except Exception:
        # In case we can't parse the GML data for this element, return the string 'Unknown'
        return "Unknown"
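# Example: findgmlkey(gml_json, 'gml:pos', 0) returns the first 'gml:pos'
# value found anywhere in the tree, or the string "Unknown" if it is missing.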
################################################################
#
# Extract relevant values for TFW file/Worldfile
class GML_Pos_offsetVectors():

    # Sample metadata structure of JPEG2000 files (may differ!):
    #
    # offsetVector_1 and offsetVector_2:
    #
    # gml:FeatureCollection
    #   gml:featureMember
    #     gml:FeatureCollection
    #       gml:featureMember
    #         gml:RectifiedGridCoverage
    #           gml:rectifiedGridDomain
    #             gml:RectifiedGrid
    #               gml:offsetVector[0]
    #                 #text
    #               gml:offsetVector[1]
    #                 #text
    #
    # gml_pos:
    #
    # gml:FeatureCollection
    #   gml:featureMember
    #     gml:FeatureCollection
    #       gml:featureMember
    #         gml:RectifiedGridCoverage
    #           gml:rectifiedGridDomain
    #             gml:RectifiedGrid
    #               gml:origin
    #                 gml:Point
    #                   gml:pos

    # Find the offsetVector elements in the file metadata.
    # These elements include the '#text' field which we are searching for.
    gml_offsetVector_1 = findgmlkey(gml_json, '#text', 0)
    gml_offsetVector_2 = findgmlkey(gml_json, '#text', 1)

    # Check whether we have a gml:pos or a gml:coordinates element in the file metadata.
    # gml:coordinates is a deprecated type according to opengis.net.
    # findgmlkey never raises, so check for its 'Unknown' fallback value instead.
    gml_pos = findgmlkey(gml_json, 'gml:pos', 0)
    if gml_pos == "Unknown":
        gml_pos = findgmlkey(gml_json, 'gml:coordinates', 0)

    # Convert gml_pos to list type in case it is of string type
    if type(gml_pos) is str:
        # Split the values, using any symbol except dots, minus signs and numbers as a separator
        gml_pos = re.split(r'[^-\d.]+', gml_pos)
    # Get the semi-major axis of the Earth from the EPSG metadata
    # TODO get this actually from the metadata!
    #try:
    # Try to get the value
    # Fallback value
    #except:
    earth_axis_semimajor = 6378137

    # Estimated meters for one degree on the Earth's surface for the used ellipsoid model
    dec_mult = float((2 * math.pi * earth_axis_semimajor) / 360)
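    # With the semi-major axis above, dec_mult evaluates to roughly
    # 111319.49 meters per degree at the equator. Note that this value is
    # currently not used anywhere else in the script.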
    # Declare a new list 'l'
    l = []

    for d in (gml_offsetVector_1, gml_offsetVector_2):
        if type(d) is str:
            d = re.split(r'[^-\d.]+', d)
        # Add the extracted values to list 'l'
        l += d

    # The assumed length of list 'l' is either 4 (gml:pos type metadata) or
    # 6 (gml:coordinates type metadata). We must treat these list types differently.
    # In case the length is neither of those, raise an error.
    #
    # Map the correct values into the new list 'g'
    #
    g = [0] * 4

    if len(l) == 4:
        g[0] = l[0]
        g[1] = l[1]
        g[2] = l[2]
        g[3] = l[3]
    elif len(l) == 6:
        g[0] = l[3]
        g[1] = l[4] # TODO is this correct index?
        g[2] = l[2] # TODO is this correct index?
        g[3] = l[1]
    else:
        raise ValueError("Error: Incorrect worldfile metadata definition for rotational and pixel size values")
    # World file definition:
    # https://en.wikipedia.org/wiki/World_file
    #
    # g[0] = pixel size of X-axis in map units
    # g[1] = Y-axis rotation
    # g[2] = X-axis rotation
    # g[3] = pixel size of Y-axis in map units
    # gml_pos[0] = X-coordinate of the center of the upper left pixel
    # gml_pos[1] = Y-coordinate of the center of the upper left pixel
    # TODO should the gml_pos[1] value be decreased by -1?
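    # A standard worldfile is six plain-text lines: the four values in 'g'
    # followed by the two origin coordinates, for example (illustrative
    # numbers only):
    #
    #   0.5
    #   0.0
    #   0.0
    #   -0.5
    #   385000.25
    #   6672000.75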
gml_posinfo = GML_Pos_offsetVectors()
################################################################
#
# EPSG INFORMATION RETRIEVAL
class EPSGRetrieval():

    #def __init__(self):
    #try:
    epsg_number = int(findgmlkey(gml_json, '@srsName', 0).split(':')[-1])
    #except:
    #Warn("Warning: No valid EPSG number found")
    #return
    epsg_file = str(epsg_number) + '.xml'
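    # Example: an '@srsName' value such as 'urn:ogc:def:crs:EPSG::3067'
    # yields epsg_number 3067 and epsg_file '3067.xml' (illustrative values).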
    @staticmethod
    def EPSG_retrieve():
        if not os.path.isfile('./' + EPSGRetrieval.epsg_file):
            # EPSG XML data URL
            epsg_url = 'http://epsg.io/' + EPSGRetrieval.epsg_file
            urlreq = URL.Request(
                epsg_url,
                data = None,
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
                }
            )
            # Try to download the XML file and save it
            try:
                epsg_xml = URL.urlopen(urlreq).read().decode('utf-8')
                with open(EPSGRetrieval.epsg_file, 'w') as epsg_of:
                    epsg_of.write(epsg_xml)
            except Exception:
                Warn("Warning: Could not download EPSG metadata")
    @staticmethod
    def EPSG_read():
        with open(EPSGRetrieval.epsg_file, 'r') as epsg_rf:
            epsg_metadata_list = epsg_rf.read()

        epsgparser = GMLDataParser(epsg_metadata_list)
        epsg_json = epsgparser.jsontree()

        gml_datum = findgmlkey(epsg_json, 'gml:datumName', 0)
        gml_ellipsoid = findgmlkey(epsg_json, 'gml:ellipsoidName', 0)
        gml_coordsys = findgmlkey(epsg_json, 'gml:srsName', 0)
        gml_axis_1_abbrev = findgmlkey(epsg_json, 'gml:axisAbbrev', 0)
        gml_axis_1_dir = findgmlkey(epsg_json, 'gml:axisDirection', 0).capitalize()
        gml_axis_2_abbrev = findgmlkey(epsg_json, 'gml:axisAbbrev', 1)
        gml_axis_2_dir = findgmlkey(epsg_json, 'gml:axisDirection', 1).capitalize()
        # TODO. These have a child element #text which contains the actual value
        gml_semimajor_axis = findgmlkey(epsg_json, 'gml:semiMajorAxis', 0)
        gml_inverse_flat = findgmlkey(epsg_json, 'gml:inverseFlattening', 0)
#epsg_data = EPSGRetrieval()
#epsg_data.EPSG_read()
#sys.exit()
################################################################
#
# PHYSICAL AREA SIZE CALCULATOR
def axisCalculator():

    # Axis-based data
    try:
        x_high = float(findgmlkey(gml_json, 'gml:upperCorner', 0).split()[0])
        x_low = float(findgmlkey(gml_json, 'gml:lowerCorner', 0).split()[0])
        y_high = float(findgmlkey(gml_json, 'gml:upperCorner', 0).split()[1])
        y_low = float(findgmlkey(gml_json, 'gml:lowerCorner', 0).split()[1])
    except Exception:
        # Pixel-based data
        try:
            x_high = float(findgmlkey(gml_json, 'gml:high', 0).split()[0])
            x_low = float(findgmlkey(gml_json, 'gml:low', 0).split()[0])
            y_high = float(findgmlkey(gml_json, 'gml:high', 0).split()[1])
            y_low = float(findgmlkey(gml_json, 'gml:low', 0).split()[1])
        except Exception:
            x_high = "Unknown"
            x_low = "Unknown"
            y_high = "Unknown"
            y_low = "Unknown"

    # If any of the values could not be resolved, return 'Unknown' placeholders
    for t in (x_high, x_low, y_high, y_low):
        if type(t) is not float:
            return list([
                'Unknown',
                'Unknown',
                'Unknown',
                'Unknown',
                'Unknown'
            ])
    # Convert an angle from radians to gradians (gon)
    def RadtoGrad(num):
        rad_to_deg = 180 * num / math.pi
        deg_to_grad = 10 * rad_to_deg / 9
        return deg_to_grad
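    # Worked example: for equal X and Y lengths, atan2 returns pi/4 rad,
    # i.e. 45 degrees, which RadtoGrad converts to 50 gradians.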
    ###############################
    # X and Y lengths
    # Area size in km^2
    x_length = x_high - x_low
    y_length = y_high - y_low
    xy_area = (x_length * y_length) / 1000000

    ###############################
    # Inverse geodetic calculation
    xy_hypotenuse = math.sqrt(x_length ** 2 + y_length ** 2)
    inverse_geod_angle = RadtoGrad(math.atan2(y_length, x_length))

    ###############################
    return list([
        format(x_length, '.2f'),
        format(y_length, '.2f'),
        format(xy_area, '.2f'),
        format(xy_hypotenuse, '.2f'),
        format(inverse_geod_angle, '.2f')
    ])
gml_calc = axisCalculator()
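# gml_calc holds, in order: X length (m), Y length (m), area (km^2),
# corner-to-corner distance (m) and azimuth angle (gradians), or five
# 'Unknown' strings if the values could not be resolved.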
################################################################
#
# TFW FORMAT PARSE
def tfwparse():
    # Combine the pixel size / rotation values with the origin coordinates
    worldfile_values = gml_posinfo.g + gml_posinfo.gml_pos

    worldfile_out = ''
    for value in worldfile_values:
        worldfile_out += format(float(value)) + '\n'

    # Return worldfile_out, removing the last empty line
    return worldfile_out[:-1]
################################################################
#
# INFORMATION PARSE
# Extract all important metadata elements
def infoparse():

    def getkeys():
        # TODO these might or might not be defined in the JSON data!
        infolist = [
            ['Image Name', args.inputfile.split('.')[0] ],
            ['Spatial Reference System Name', findgmlkey(gml_json, '@srsName', 0) ],
            ['GML File Name', findgmlkey(gml_json, 'gml:fileName', 0) ],
            ['File Structure', findgmlkey(gml_json, 'gml:fileStructure', 0) ],
            ['Rectified Grid Coverage ID', findgmlkey(gml_json, '@dimension', 0) ],
            #['Axis Names', ' '.join(findgmlkey('gml:axisName', 0)) ],
            ['Map Scale', ],
            ['Upper Corner Coordinates', findgmlkey(gml_json, 'gml:upperCorner', 0) ],
            ['Lower Corner Coordinates', findgmlkey(gml_json, 'gml:lowerCorner', 0) ],
            ['X-axis Length in Meters', gml_calc[0] ],
            ['Y-axis Length in Meters', gml_calc[1] ],
            ['Area Size in Square Kilometers', gml_calc[2] ],
            ['Distance of Corner Points in Meters', gml_calc[3] ],
            ['Azimuth Angle of Corner Points in Gradians', gml_calc[4] ],
            ['Grid Envelope High', findgmlkey(gml_json, 'gml:high', 0) ],
            ['Grid Envelope Low', findgmlkey(gml_json, 'gml:low', 0) ],
            ['X-axis Pixel Size in Map Units', gml_posinfo.g[0] ],
            ['Y-axis Pixel Size in Map Units', gml_posinfo.g[3] ],
            ['Y-axis Rotation', gml_posinfo.g[1] ],
            ['X-axis Rotation', gml_posinfo.g[2] ],
            ['Upper Left Pixel X-coordinate Center in Map Units', gml_posinfo.gml_pos[0] ],
            ['Upper Left Pixel Y-coordinate Center in Map Units', gml_posinfo.gml_pos[1] ]
            #['EPSG Projection Code',
            #['Projection Name',
            #['Projection Area',
            #['Image Area',
        ]

        #row_format ="{:>15}" * (len(teams_list) + 1)
        #print(row_format.format("", *teams_list))
        #for team, row in zip(teams_list, data):
        #    print row_format.format(team, *row)

        #for i in infolist:
        #    print(i)
        #sys.exit()

        # Build the output as a string so that it can be both printed
        # and written to a file
        info_out = ''
        for row in infolist:
            info_out += ' '.join(str(col) for col in row) + '\n'
        return info_out[:-1]

    return getkeys()
#print(gml_source)
################################
#
# OUTPUT WRITING
if args.outputfile is not None:
    with open(args.outputfile, 'w') as o:
        if args.outputformat == 'xml':
            if args.formatting == 'pretty':
                o.write(gmlparser.xmlpretty())
            elif args.formatting == 'raw':
                o.write(gmlparser.xmlraw())
            else:
                raise ValueError("Error: Undefined formatting")
        elif args.outputformat == 'json':
            if args.formatting == 'pretty':
                o.write(gmlparser.jsonpretty())
            elif args.formatting == 'raw':
                o.write(gmlparser.jsonraw())
            else:
                raise ValueError("Error: Undefined formatting")
        elif args.outputformat in ('tfw', 'worldfile'):
            o.write(tfwparse())
        elif args.outputformat == 'info':
            o.write(infoparse())
        else:
            raise ValueError("Error: Invalid data format")
else:
    if args.outputformat == 'xml':
        if args.formatting == 'pretty':
            print(gmlparser.xmlpretty())
        elif args.formatting == 'raw':
            print(gmlparser.xmlraw())
        else:
            raise ValueError("Error: Undefined formatting")
    elif args.outputformat == 'json':
        if args.formatting == 'pretty':
            print(gmlparser.jsonpretty())
        elif args.formatting == 'raw':
            print(gmlparser.jsonraw())
        else:
            raise ValueError("Error: Undefined formatting")
    elif args.outputformat in ('tfw', 'worldfile'):
        print(tfwparse())
    elif args.outputformat == 'info':
        print(infoparse())
    else:
        raise ValueError("Error: Invalid data format")