Simple JPEG2000 GML data parser / extractor

#!/usr/bin/env python3
# Simple JPEG2000 GML data parser
# Copyright (C) 2019 Pekka Helenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

################################################################

import sys
from warnings import warn as Warn
import os.path
import argparse
import re
import xmltodict
import json
import urllib.request as URL
import math

# TODO import csv
# TODO retrieve place name by using metadata coordinates & ref. system info
#      requires an internet connection and connection to a valid server
#
# TODO rename JPEG2000 file based on metadata entries, syntax given by user
#
# TODO fix tfw export for JPEG2000 files

################################################################
#
# INPUT ARGUMENTS

argparser = argparse.ArgumentParser()

argparser.add_argument('-i', '--input', help = 'Input JPEG2000 image file', nargs = '?', dest = 'inputfile')
argparser.add_argument('-f', '--dataformat', help = 'Output format (Available: xml | json | [tfw|worldfile] | info)', nargs = '?', dest = 'outputformat')
argparser.add_argument('-o', '--output', help = 'Output file name', nargs = '?', dest = 'outputfile')
argparser.add_argument('-l', '--formatting', help = 'Data formatting (Default: pretty; Available: raw, pretty)', nargs = '?', dest = 'formatting')

args = argparser.parse_args()

# Formatting defaults to pretty format
#
if args.formatting is None:
    args.formatting = 'pretty'
################################################################

if not len(sys.argv) > 1:
    argparser.print_help()
    exit(0)

if args.inputfile is None:
    raise ValueError("Error: No input file specified")

if not args.inputfile.endswith('.jp2'):
    Warn("Warning: Not a valid JPEG2000 file suffix")

if args.outputformat is None:
    raise ValueError("Error: No output format specified")
elif args.outputformat not in ('json', 'xml', 'tfw', 'worldfile', 'info'):
    raise ValueError("Error: Not a valid output format")
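
# Example invocations (the script and file names below are hypothetical):
#
#   ./jp2gmlparser.py -i map_sheet.jp2 -f info
#   ./jp2gmlparser.py -i map_sheet.jp2 -f json -l raw -o map_sheet.json
#   ./jp2gmlparser.py -i map_sheet.jp2 -f tfw -o map_sheet.tfw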
################################################################
#
# JPEG2000 CHECK STRINGS
# Look for these jp2 strings in the file header
#
jp2_header_str = ['ftypjp2', 'jp2 jpx', 'jp2', 'jp2h', 'jp2 J2P1']

# First strings to look for in the selected jp2 file.
# Basically, we are looking for the start of the file footer,
# which is indicated by one of these strings.
#
# We need to convert the strings into bytes with the encode method
# for the byte comparisons in the loops below.
#
mdata_start_str = [ str.encode('gml.data'), str.encode('gxml'), str.encode('fxml') ]
mdata_end_str = str.encode('uuid')
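
# For reference: a standard JP2 file starts with the JPEG 2000 signature box
# followed by a File Type box, and their ASCII-visible parts (such as 'jP',
# 'ftypjp2' and 'jp2h') are what the header check below matches against.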
################################################################
#
# OPEN JPEG2000 file
# Open the image file in read-only binary mode
with open(args.inputfile, 'rb') as f:

    ################################################################
    #
    # JPEG2000 header check
    # Check the first 4 file lines
    header_lines = f.readlines()[0:4]

    # Declare a variable to store the header text string
    header_str = ''

    # For each header line 1-4...
    for a in header_lines:
        # Decode the binary formatted string (bytes -> string conversion) and ignore any errors we encounter
        header_decode = a.decode('utf-8', errors='ignore')
        # Store the decoded string into the header_str variable
        header_str += header_decode

    # Check the existence of each jp2 specific string we have defined in the 'jp2_header_str' list above
    for jp2_str in jp2_header_str:
        # If a jp2 specific string is found, set up a new variable t and break the for loop
        if jp2_str in header_str:
            t = ''
            break

    # Variable t is not defined if no valid jp2 string was found. Thus, this test gives
    # us an exception (NameError) if no jp2 string is found.
    try:
        t
    except NameError:
        raise ValueError("Error: Not a valid JPEG2000 file")
    ################################################################
    #
    # PARSE METADATA LINES
    # Enumerate all lines, look for the metadata start line, using
    # the strings in 'mdata_start_str' as a reference.
    # Break the loop when found. If not found, abort.
    #
    # Return to the first line again in order to parse footer lines
    f.seek(0)

    for mstart_num, mstart_line in enumerate(f):
        # TODO better formatting for this if statement:
        if mdata_start_str[0] in mstart_line or mdata_start_str[1] in mstart_line or mdata_start_str[2] in mstart_line:
            break
        # else
        # TODO echo cannot find metadata start. Abort
        # TODO should return value 2

    #print(mstart_num)
    #sys.exit()

    # Enumerate all lines, look for the metadata end line, using
    # the string 'mdata_end_str' as a reference.
    # Break the loop when found. If not found, abort.
    #
    f.seek(0)
    for mend_num, mend_line in enumerate(f):
        if mdata_end_str in mend_line:
            break
        # else
        # TODO echo cannot find metadata end. Abort

    # Reset readlines
    #
    # Convert the metadata start line into a 'list' with the split method
    # and merge it with the rest of the metadata lines, defined by the readlines method.
    # The type of this line list is 'list', thus we use the + operator to combine these
    # lists.
    #
    f.seek(0)
    metadata_lines = mstart_line.split() + f.readlines()[mstart_num:mend_num]
    #mdata_lines = mstart_line.split() + f.readlines()

    # Create a new metadata_str variable where we will store our extracted footer strings.
    metadata_str = ''

    for byteline in metadata_lines:
        # Try to decode each metadata line to UTF-8 format.
        # As these lines are binary code, the conversion will fail for some
        # of them. In case of a failure, we let the for loop pass to the next
        # line.
        #
        # Add each decoded line into the 'metadata_str' variable
        #
        try:
            byteline_decoded = byteline.decode('utf-8', errors='strict')
            metadata_str += byteline_decoded
        except Exception:
            pass

    f.close()
# Strip any leading characters before the first '<' and a possible stray
# trailing character after the last '>' from the decoded metadata string
metadata_xml_all = re.sub(r'(^[^<]*)|([^>]$)', '', metadata_str)

# Create a list element from the extracted metadata strings
metadata_xml_all_list = metadata_xml_all.split()

# Find the last element containing <> symbols in metadata_xml_all_list,
# get the tag name between them and store it to a new variable
for i in reversed(metadata_xml_all_list):
    if re.match('</*.*>', i):
        last_tag = re.sub('</|>', '', i)
        break

# In the original metadata list, find the first occurrence of the 'last_tag'
for firstxml_index, value in enumerate(metadata_xml_all_list):
    if re.match('<' + last_tag + '>?', value):
        break

# From the joined metadata list, delete all list entries present before our 'last_tag'.
# Convert the list to string format
metadata_parsed_list = metadata_xml_all_list[firstxml_index:]
metadata_joined_list = ' '.join(metadata_parsed_list)
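
# After the cleanup above, metadata_joined_list holds the embedded GML/XML
# document as a single whitespace-joined string. A shortened, purely
# illustrative example of such a fragment:
#
#   <gml:FeatureCollection xmlns:gml="http://www.opengis.net/gml">
#     <gml:featureMember>
#       <gml:RectifiedGridCoverage>
#         <gml:rectifiedGridDomain>
#           <gml:RectifiedGrid dimension="2">
#             <gml:origin>
#               <gml:Point srsName="urn:ogc:def:crs:EPSG::3067">
#                 <gml:pos>385800.0 6672200.0</gml:pos>
#               </gml:Point>
#             </gml:origin>
#             <gml:offsetVector>2.0 0.0</gml:offsetVector>
#             <gml:offsetVector>0.0 -2.0</gml:offsetVector>
#           </gml:RectifiedGrid>
#         </gml:rectifiedGridDomain>
#       </gml:RectifiedGridCoverage>
#     </gml:featureMember>
#   </gml:FeatureCollection>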
################################################################
#
class GMLDataParser(object):

    def __init__(self, datalist):
        self.datalist = datalist

    def xmlraw(self):
        return xmltodict.parse(self.datalist)

    def xmlpretty(self):
        return xmltodict.unparse(xmltodict.parse(self.datalist),
            pretty=True, indent=" ", newl="\n")

    def jsonraw(self):
        return json.dumps(xmltodict.parse(self.datalist),
            separators=(',', ':'))

    def jsonpretty(self):
        return json.dumps(xmltodict.parse(self.datalist),
            indent=2, sort_keys=True)

    # Convert GML metadata to a JSON tree object
    def jsontree(self):
        return json.loads(self.jsonpretty())

    # Function to get nested key values from JSON data
    # by arainchi
    # https://stackoverflow.com/a/19871956
    def findkey(self, tree, keyvalue):
        if isinstance(tree, list):
            for i in tree:
                for x in self.findkey(i, keyvalue):
                    yield x
        elif isinstance(tree, dict):
            if keyvalue in tree:
                yield tree[keyvalue]
            for j in tree.values():
                for x in self.findkey(j, keyvalue):
                    yield x
gmlparser = GMLDataParser(metadata_joined_list)
gml_json = gmlparser.jsontree()

def findgmlkey(data, gmlkey, num):
    try:
        return list(gmlparser.findkey(data, gmlkey))[num]
    except:
        # In case we can't parse GML data for this element, return the string 'Unknown'
        return str("Unknown")
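
# Usage sketch for the helpers above (the returned values are illustrative,
# not from any particular file):
#
#   findgmlkey(gml_json, 'gml:pos', 0)      ->  '385800.0 6672200.0'
#   findgmlkey(gml_json, '@dimension', 0)   ->  '2'
#   findgmlkey(gml_json, 'no:such:key', 0)  ->  'Unknown'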
################################################################
#
# Extract relevant values for TFW file / Worldfile
class GML_Pos_offsetVectors():

    # Sample metadata structure of JPEG2000 files (may differ!):
    #
    # offsetVector_1 and offsetVector_2:
    #
    # gml:FeatureCollection
    #   gml:featureMember
    #     gml:FeatureCollection
    #       gml:featureMember
    #         gml:RectifiedGridCoverage
    #           gml:rectifiedGridDomain
    #             gml:RectifiedGrid
    #               gml:offsetVector[0]
    #                 #text
    #               gml:offsetVector[1]
    #                 #text
    #
    # gml_pos:
    #
    # gml:FeatureCollection
    #   gml:featureMember
    #     gml:FeatureCollection
    #       gml:featureMember
    #         gml:RectifiedGridCoverage
    #           gml:rectifiedGridDomain
    #             gml:RectifiedGrid
    #               gml:origin
    #                 gml:Point
    #                   gml:pos

    # Find offsetVector elements in the file metadata.
    # These elements include the field #text which we are searching for.
    gml_offsetVector_1 = findgmlkey(gml_json, '#text', 0)
    gml_offsetVector_2 = findgmlkey(gml_json, '#text', 1)

    # Check whether we have a gml:pos or gml:coordinates element in the file metadata.
    # gml:coordinates is a deprecated type according to opengis.net
    try:
        gml_pos = findgmlkey(gml_json, 'gml:pos', 0)
    except:
        gml_pos = findgmlkey(gml_json, 'gml:coordinates', 0)

    # Convert gml_pos to list type in case it is string type
    if type(gml_pos) is str:
        # Split values; use any symbol as a separator except for dots, minus prefixes and numbers.
        gml_pos = re.split(r'[^\-^\d^\.]+', gml_pos)

    # Get the semi-major axis of the Earth from EPSG metadata
    # TODO get this actually from metadata!
    #try:
    # Try to get the value
    # Fallback value
    #except:
    earth_axis_semimajor = 6378137

    # Estimated meters for one degree on the Earth's surface for the used ellipsoid model
    dec_mult = float((2 * math.pi * earth_axis_semimajor) / 360)

    # Declare a new list 'l'
    l = []
    for d in (gml_offsetVector_1, gml_offsetVector_2):
        if type(d) is str:
            d = re.split(r'[^\-^\d^\.]+', d)
        # Add extracted values to list 'l'
        l += d

    # The assumed length of list 'l' is either 4 (gml:pos) or 6 (gml:coordinates).
    # We must treat these list types differently.
    # In case the length is neither of those, raise an error.
    #
    # Map the correct values into the new array 'g'
    #
    g = [0] * 4
    if len(l) == 4:
        g[0] = l[0]
        g[1] = l[1]
        g[2] = l[2]
        g[3] = l[3]
    elif len(l) == 6:
        g[0] = l[3]
        g[1] = l[4] # TODO is this correct index?
        g[2] = l[2] # TODO is this correct index?
        g[3] = l[1]
    else:
        raise ValueError("Error: Incorrect worldfile metadata definition for rotational and pixel size values")

    # World file definition
    # https://en.wikipedia.org/wiki/World_file
    #
    # g[0]       = pixel size of X-axis in map units
    # g[1]       = Y-axis rotation
    # g[2]       = X-axis rotation
    # g[3]       = pixel size of Y-axis in map units
    # gml_pos[0] = X-coordinate of the center of the upper left pixel
    # gml_pos[1] = Y-coordinate of the center of the upper left pixel
    # TODO should gml_pos[1] value be decreased by -1?

gml_posinfo = GML_Pos_offsetVectors()
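
# Worked example of the dec_mult value above, assuming the GRS80/WGS84
# semi-major axis of 6378137 m:
#
#   2 * pi * 6378137 m  =  40075016.69 m  (equatorial circumference)
#   40075016.69 / 360   =  111319.49 m    (approx. meters per one degree)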
################################################################
#
# EPSG INFORMATION RETRIEVAL
class EPSGRetrieval():

    #def __init__(self):

    #try:
    epsg_number = int(findgmlkey(gml_json, '@srsName', 0).split(':')[-1])
    #except:
    #Warn("Warning: Not a valid EPSG number found")
    #return

    epsg_file = str(epsg_number) + '.xml'

    @staticmethod
    def EPSG_retrieve():
        if not os.path.isfile('./' + EPSGRetrieval.epsg_file):

            # EPSG XML data URL
            epsg_url = 'http://epsg.io/' + EPSGRetrieval.epsg_file

            urlreq = URL.Request(
                epsg_url,
                data = None,
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
                }
            )

            # Try to download the XML file and save it
            with open(EPSGRetrieval.epsg_file, 'w') as epsg_of:
                try:
                    epsg_of.write(str(URL.urlopen(urlreq).read().decode('utf-8')))
                except:
                    Warn("Warning: Could not download EPSG metadata")
                epsg_of.close()

    @staticmethod
    def EPSG_read():
        with open(EPSGRetrieval.epsg_file, 'r') as epsg_rf:
            epsg_metadata_list = epsg_rf.read()

        epsgparser = GMLDataParser(epsg_metadata_list)
        epsg_json = epsgparser.jsonpretty()

        gml_datum = findgmlkey(epsg_json, 'gml:datumName', 0)
        gml_ellipsoid = findgmlkey(epsg_json, 'gml:ellipsoidName', 0)
        gml_coordsys = findgmlkey(epsg_json, 'gml:srsName', 0)
        gml_axis_1_abbrev = findgmlkey(epsg_json, 'gml:axisAbbrev', 0)
        gml_axis_1_dir = findgmlkey(epsg_json, 'gml:axisDirection', 0).capitalize()
        gml_axis_2_abbrev = findgmlkey(epsg_json, 'gml:axisAbbrev', 1)
        gml_axis_2_dir = findgmlkey(epsg_json, 'gml:axisDirection', 1).capitalize()
        # TODO these have a child element #text which contains the actual value
        gml_semimajor_axis = findgmlkey(epsg_json, 'gml:semiMajorAxis', 0)
        gml_inverse_flat = findgmlkey(epsg_json, 'gml:inverseFlattening', 0)

#epsg_data = EPSGRetrieval()
#epsg_data.EPSG_read()
#sys.exit()
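
# Illustrative trace of the lookup above (the srsName value is hypothetical):
#
#   @srsName 'urn:ogc:def:crs:EPSG::3067'  ->  epsg_number = 3067
#   epsg_file '3067.xml'                   ->  http://epsg.io/3067.xml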
################################################################
#
# PHYSICAL AREA SIZE CALCULATOR
def axisCalculator():

    # Axis-based data
    try:
        x_high = float(findgmlkey(gml_json, 'gml:upperCorner', 0).split()[0])
        x_low = float(findgmlkey(gml_json, 'gml:lowerCorner', 0).split()[0])
        y_high = float(findgmlkey(gml_json, 'gml:upperCorner', 0).split()[1])
        y_low = float(findgmlkey(gml_json, 'gml:lowerCorner', 0).split()[1])
    except:
        # Pixel-based data
        try:
            x_high = float(findgmlkey(gml_json, 'gml:high', 0).split()[0])
            x_low = float(findgmlkey(gml_json, 'gml:low', 0).split()[0])
            y_high = float(findgmlkey(gml_json, 'gml:high', 0).split()[1])
            y_low = float(findgmlkey(gml_json, 'gml:low', 0).split()[1])
        except:
            x_high = "Unknown"
            x_low = "Unknown"
            y_high = "Unknown"
            y_low = "Unknown"

    for t in (x_high, x_low, y_high, y_low):
        if type(t) is not float:
            return list([
                'Unknown',
                'Unknown',
                'Unknown',
                'Unknown',
                'Unknown'
            ])

    def RadtoGrad(num):
        rad_to_deg = 180 * num / math.pi
        deg_to_grad = 10 * rad_to_deg / 9
        return deg_to_grad

    ###############################
    # X and Y lengths
    # Area size in km^2
    x_length = x_high - x_low
    y_length = y_high - y_low
    xy_area = (x_length * y_length) / 1000000

    ###############################
    # Inverse geodetic calculation
    xy_hypotenuse = math.sqrt(x_length ** 2 + y_length ** 2)
    inverse_geod_angle = RadtoGrad(math.atan2(y_length, x_length))

    ###############################
    return list([
        format(x_length, '.2f'),
        format(y_length, '.2f'),
        format(xy_area, '.2f'),
        format(xy_hypotenuse, '.2f'),
        format(inverse_geod_angle, '.2f')
    ])

gml_calc = axisCalculator()
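
# Worked example of the calculation above (illustrative corner values,
# metric map units assumed):
#
#   lowerCorner = (384000.0, 6670000.0), upperCorner = (388000.0, 6673000.0)
#   x_length      = 388000.0 - 384000.0       = 4000.00 m
#   y_length      = 6673000.0 - 6670000.0     = 3000.00 m
#   xy_area       = (4000 * 3000) / 1000000   = 12.00 km^2
#   xy_hypotenuse = sqrt(4000^2 + 3000^2)     = 5000.00 m
#   angle         = atan2(3000, 4000)         = 36.87 deg = 40.97 gradians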
################################################################
#
# TFW FORMAT PARSE
def tfwparse():
    worldfile_values = gml_posinfo.g + gml_posinfo.gml_pos
    worldfile_out = ''
    for value in worldfile_values:
        worldfile_out += format(float(value)) + '\n'
    # Return worldfile_out, remove the last empty line
    return worldfile_out[:-1]
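
# A resulting world file (.tfw) contains six plain-text lines, one value per
# line, e.g. (illustrative values):
#
#   2.0         <- pixel size of X-axis in map units
#   0.0         <- Y-axis rotation
#   0.0         <- X-axis rotation
#   -2.0        <- pixel size of Y-axis in map units (usually negative)
#   385800.0    <- X-coordinate of the center of the upper left pixel
#   6672200.0   <- Y-coordinate of the center of the upper left pixel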
################################################################
#
# INFORMATION PARSE
# Extract all important metadata elements
def infoparse():

    def getkeys():

        # TODO these might or might not be defined in the JSON data!
        infolist = [
            ['Image Name', args.inputfile.split('.')[0] ],
            ['Source Name', findgmlkey(gml_json, '@srsName', 0) ],
            ['GML File Name', findgmlkey(gml_json, 'gml:fileName', 0) ],
            ['File Structure', findgmlkey(gml_json, 'gml:fileStructure', 0) ],
            ['Rectified Grid Coverage ID', findgmlkey(gml_json, '@dimension', 0) ],
            #['Axis Names', ' '.join(findgmlkey('gml:axisName', 0)) ],
            ['Map Scale', ],
            ['Upper Corner Coordinates', findgmlkey(gml_json, 'gml:upperCorner', 0) ],
            ['Lower Corner Coordinates', findgmlkey(gml_json, 'gml:lowerCorner', 0) ],
            ['X-axis Length in Meters', gml_calc[0] ],
            ['Y-axis Length in Meters', gml_calc[1] ],
            ['Area Size in Square Kilometers', gml_calc[2] ],
            ['Distance of Corner Points in Meters', gml_calc[3] ],
            ['Azimuth Angle of Corner Points in Gradians', gml_calc[4] ],
            ['Grid Envelope High', findgmlkey(gml_json, 'gml:high', 0) ],
            ['Grid Envelope Low', findgmlkey(gml_json, 'gml:low', 0) ],
            ['X-axis Pixel Size in Map Units', gml_posinfo.g[0] ],
            ['Y-axis Pixel Size in Map Units', gml_posinfo.g[3] ],
            ['X-axis Rotation', gml_posinfo.g[1] ],
            ['Y-axis Rotation', gml_posinfo.g[2] ],
            ['Upper Left Pixel X-coordinate Center in Map Units', gml_posinfo.gml_pos[0] ],
            ['Upper Left Pixel Y-coordinate Center in Map Units', gml_posinfo.gml_pos[1] ]
            #['EPSG Projection Code',
            #['Projection Name',
            #['Projection Area',
            #['Image Area',
        ]

        #row_format = "{:>15}" * (len(teams_list) + 1)
        #print(row_format.format("", *teams_list))
        #for team, row in zip(teams_list, data):
        #    print(row_format.format(team, *row))

        #for i in infolist:
        #    print(i)
        #sys.exit()

        # Build a printable string from the info rows so that the caller can
        # either print it or write it to a file
        info_out = ''
        for row in infolist:
            info_out += ' '.join(str(value) for value in row) + '\n'
        # Remove the last empty line
        return info_out[:-1]

    return getkeys()
    #print(gml_source)
################################
#
# OUTPUT WRITING
if args.outputfile is not None:
    with open(args.outputfile, 'w') as o:
        if args.outputformat == 'xml':
            if args.formatting == 'pretty':
                o.write(gmlparser.xmlpretty())
            elif args.formatting == 'raw':
                o.write(str(gmlparser.xmlraw()))
            else:
                raise ValueError("Error: Undefined formatting")
        elif args.outputformat == 'json':
            if args.formatting == 'pretty':
                o.write(gmlparser.jsonpretty())
            elif args.formatting == 'raw':
                o.write(gmlparser.jsonraw())
            else:
                raise ValueError("Error: Undefined formatting")
        elif args.outputformat in ('tfw', 'worldfile'):
            o.write(tfwparse())
        elif args.outputformat == 'info':
            o.write(infoparse())
        else:
            raise ValueError("Error: Invalid data format")
else:
    if args.outputformat == 'xml':
        if args.formatting == 'pretty':
            print(gmlparser.xmlpretty())
        elif args.formatting == 'raw':
            print(gmlparser.xmlraw())
        else:
            raise ValueError("Error: Undefined formatting")
    elif args.outputformat == 'json':
        if args.formatting == 'pretty':
            print(gmlparser.jsonpretty())
        elif args.formatting == 'raw':
            print(gmlparser.jsonraw())
        else:
            raise ValueError("Error: Undefined formatting")
    elif args.outputformat in ('tfw', 'worldfile'):
        print(tfwparse())
    elif args.outputformat == 'info':
        print(infoparse())
    else:
        raise ValueError("Error: Invalid data format")