本篇內容主要講解“如何使用Python寫一個簡單的JSONParser”,感興趣的朋友不妨來看看。本文介紹的方法操作簡單快捷,實用性強。下面就讓小編來帶大家學習“如何使用Python寫一個簡單的JSONParser”吧!
成都創新互聯公司服務緊隨時代發展步伐,進行技術革新和技術進步,經過十余年的發展和積累,已經匯集了一批資深網站策劃師、設計師、專業的網站實施團隊以及高素質售后服務人員,并且完全形成了一套成熟的業務流程,能夠完全依照客戶要求對網站進行做網站、成都網站建設、建設、維護、更新和改版,實現客戶網站對外宣傳展示的首要目的,并為客戶企業品牌互聯網化提供全面的解決方案。
JSON 的詞法分析,我主要是參考上面這個截圖里面的方式,自己寫了一個簡單的示例。寫得比較簡單,應該說它只能支持 JSON 的一個簡單子集。
這里 TOKEN 的種類,參考了 https://json.org,不過它的 JSON 的語法格式是帶 whitespace 的,我不習慣處理這個,所以沒有參考它的語法。經過詞法分析之后,過濾掉了空格、換行、制表符,我這里就是簡單的丟棄不處理。
json_tokenizer.py
使用正則表達式來進行 JSON 的詞法分析。
"""Regex-based tokenizer for a small subset of JSON.

Supported tokens: the six structural characters, integers, double-quoted
strings, ``true``/``false`` and ``null``. Whitespace is discarded.
Escape sequences inside strings are NOT decoded and floating-point numbers
are not supported.
"""
import json
import re
from typing import Dict, List, Union

# Token kinds
LEFT_BRACE = "LEFT_BRACE"        # {
RIGHT_BRACE = "RIGHT_BRACE"      # }
LEFT_BRACKET = "LEFT_BRACKET"    # [
RIGHT_BRACKET = "RIGHT_BRACKET"  # ]
COLON = "COLON"                  # :
COMMA = "COMMA"                  # ,
NUMBER = "NUMBER"                # -?(0|[1-9]\d*)
STRING = "STRING"                # ".*?"
BOOL = "BOOL"                    # true/false
NULL = "NULL"                    # null
NEWLINE = "NEWLINE"              # \n
SKIP = "SKIP"                    # ' ', '\t', '\r'
MISMATCH = "MISMATCH"            # anything else

# (kind, pattern) pairs; order matters because the alternation picks the
# first alternative that matches, so MISMATCH must stay last.
token_specification = [
    (LEFT_BRACE, r'[{]'),
    (RIGHT_BRACE, r'[}]'),
    (LEFT_BRACKET, r'[\[]'),
    (RIGHT_BRACKET, r'[\]]'),
    (COLON, r'[:]'),
    (COMMA, r'[,]'),
    # A lone zero is a valid JSON integer; the leading digit of any other
    # integer must be 1-9.
    (NUMBER, r'-?(?:0|[1-9][0-9]*)'),
    (STRING, r'".*?"'),
    (BOOL, r'(true)|(false)'),
    (NULL, r'null'),
    (NEWLINE, r'\n'),
    # '\r' is skipped as well so that CRLF documents are accepted.
    (SKIP, r'[ \t\r]'),
    (MISMATCH, r'.'),
]
tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
print("Debug: ", tok_regex)

# Compiled once so that repeated tokenizer() calls do not re-resolve it.
_TOKEN_PATTERN = re.compile(tok_regex)
# Kinds that carry no meaning for the parser and are dropped.
_IGNORED_KINDS = frozenset((NEWLINE, SKIP))


def process(kind: str, value: str) -> Dict[str, Union[str, bool, int, None]]:
    """Build the dict that represents one token.

    Args:
        kind: One of the token-kind constants defined in this module.
        value: The raw text matched for the token.

    Returns:
        ``{"kind": kind, "value": converted}`` where ``converted`` is the
        text without its surrounding quotes for STRING, an ``int`` for
        NUMBER, a ``bool`` for BOOL, ``None`` for NULL, and the raw text
        for every other kind.
    """
    if kind == STRING:
        # Strip the surrounding double quotes; escapes are left untouched.
        return {"kind": kind, "value": value[1:-1]}
    if kind == NUMBER:
        return {"kind": kind, "value": int(value)}
    if kind == BOOL:
        return {"kind": kind, "value": value == "true"}
    if kind == NULL:
        return {"kind": kind, "value": None}
    return {"kind": kind, "value": value}


def tokenizer(json_str: str) -> List[Dict[str, Union[str, bool, int, None]]]:
    """Split *json_str* into a list of token dicts.

    Whitespace tokens (NEWLINE and SKIP) are discarded.

    Args:
        json_str: The JSON text to tokenize.

    Returns:
        The tokens in document order, each built by :func:`process`.

    Raises:
        Exception: If a character matches none of the token patterns.
    """
    tokens = []
    for m in _TOKEN_PATTERN.finditer(json_str):
        # Name of the alternative that matched, i.e. the token kind.
        kind = m.lastgroup
        if kind == MISMATCH:
            raise Exception("json format is error")
        if kind in _IGNORED_KINDS:
            continue
        tokens.append(process(kind=kind, value=m.group()))
    return tokens


if __name__ == "__main__":
    with open("./demo.json", "r", encoding="utf-8") as src:
        json_doc = src.read()
    tokens = tokenizer(json_doc)
    if tokens:
        with open("./json_tokens.json", "w", encoding="utf-8") as dst:
            json.dump(tokens, dst, ensure_ascii=False)
我這里把輸入、輸出數據全部放在文檔里面了,下面我貼一下我輸入數據和部分輸出數據。
demo.json
{ "name": "小黑子", "age": 3, "gender": false, "other_info": { "friends": [ "嘎子", "潘叔", "狗" ], "declaration": "練習(xí)時(shí)長兩年半", "hobbies": [ "唱", "跳", "rap", "籃球????" ] } }
json_tokens.json 部分數據,數據我格式化了,所以比較長,這里只截取一部分。
json_parser.py
對上一步生成的 token 序列,進行 parser,生成 JSON 對應的 Dict 對象。parser 的實現參考了 antlr4 的 json 語法文件,它去掉了 whitespace,處理起來更簡單一點。
"""Recursive-descent parser that turns a JSON token sequence into Python data.

The input is a list of ``{"kind": ..., "value": ...}`` dicts. The grammar
handled here is::

    json  : obj | arr
    obj   : '{' '}' | '{' pair (',' pair)* '}'
    pair  : STRING ':' value
    arr   : '[' ']' | '[' value (',' value)* ']'
    value : obj | arr | STRING | NUMBER | BOOL | NULL
"""
import json
from typing import Any, Dict, List, Tuple, Union

# Token kinds
LEFT_BRACE = "LEFT_BRACE"        # {
RIGHT_BRACE = "RIGHT_BRACE"      # }
LEFT_BRACKET = "LEFT_BRACKET"    # [
RIGHT_BRACKET = "RIGHT_BRACKET"  # ]
COLON = "COLON"                  # :
COMMA = "COMMA"                  # ,
NUMBER = "NUMBER"                # integer literal
STRING = "STRING"                # ".*?"
BOOL = "BOOL"                    # true/false
NULL = "NULL"                    # null


class Token(object):
    """Placeholder; for simplicity tokens are plain dicts instead."""


class JSON_Parser(object):
    """Parse a token sequence into a Python ``dict`` or ``list``.

    ``self.index`` always points at the token currently being examined.
    Every ``parse_*`` method expects ``self.index`` to be on the first token
    of its construct and leaves it on the LAST token of that construct.
    """

    def __init__(self, tokens) -> None:
        self.index = 0
        self.tokens = tokens

    def get_token(self) -> Dict[str, Union[str, int, bool, None]]:
        """Return the current token without advancing.

        Raises:
            Exception: If the current index is past the end of the tokens.
        """
        if self.index < len(self.tokens):
            return self.tokens[self.index]
        raise Exception("index out of range.")

    def move_token(self) -> Dict[str, Union[str, int, bool, None]]:
        """Advance to the next token and return it.

        Raises:
            Exception: If there is no next token.
        """
        if self.index + 1 < len(self.tokens):
            self.index = self.index + 1
            return self.tokens[self.index]
        raise Exception("index out of range.")

    def parse(self) -> Union[Dict[str, Any], List[Any]]:
        """Parse the whole token sequence.

        Returns:
            A ``dict`` when the document is an object, a ``list`` when it is
            an array.

        Raises:
            Exception: If the first token opens neither an object nor an
                array, or if the sequence is malformed.
        """
        kind = self.get_token().get("kind")
        if kind == LEFT_BRACE:
            return self.parse_obj()
        if kind == LEFT_BRACKET:
            return self.parse_arr()
        raise Exception("error json, neither object or array.")

    def parse_obj(self) -> Dict[str, Any]:
        """Parse an object; the current token must be ``{``."""
        obj = {}
        kind = self.move_token().get("kind")
        # '{' '}'
        if kind == RIGHT_BRACE:
            return obj
        # '{' pair (',' pair)* '}'
        name, val = self.parse_pair()
        obj[name] = val
        # move_token() raises once the tokens run out, so the loop can only
        # end through a return or an exception.
        while True:
            kind = self.move_token().get("kind")
            if kind == COMMA:
                self.move_token()
                name, val = self.parse_pair()
                obj[name] = val
            elif kind == RIGHT_BRACE:
                return obj
            else:
                raise Exception("parse object encounter error")

    def parse_arr(self) -> List[Any]:
        """Parse an array; the current token must be ``[``."""
        arr = []
        kind = self.move_token().get("kind")
        # '[' ']' -- an empty array closes with a bracket, not a brace.
        if kind == RIGHT_BRACKET:
            return arr
        # '[' value (',' value)* ']'
        arr.append(self.parse_value())
        # move_token() raises once the tokens run out, so the loop can only
        # end through a return or an exception.
        while True:
            kind = self.move_token().get("kind")
            if kind == COMMA:
                self.move_token()
                arr.append(self.parse_value())
            elif kind == RIGHT_BRACKET:
                return arr
            else:
                raise Exception("parse array encounter error")

    def parse_value(self) -> Any:
        """Parse the value that starts at the current token.

        Returns:
            A nested ``dict``/``list`` for objects and arrays, the token's
            ``value`` for STRING, NUMBER and BOOL, and ``None`` for NULL.

        Raises:
            Exception: If the current token cannot start a value.
        """
        token = self.get_token()
        kind = token.get("kind")
        if kind == LEFT_BRACE:
            return self.parse_obj()
        if kind == LEFT_BRACKET:
            return self.parse_arr()
        if kind in (STRING, NUMBER, BOOL):
            return token.get("value")
        if kind == NULL:
            return None
        raise Exception("encounter unexcepted token")

    def parse_pair(self) -> Tuple[Any, Any]:
        """Parse ``STRING ':' value`` and return ``(name, value)``.

        Raises:
            Exception: If the current token is not a STRING or is not
                followed by a COLON.
        """
        token = self.get_token()
        name = token.get("value")
        # STRING ':' value
        if token.get("kind") == STRING:
            if self.move_token().get("kind") == COLON:
                # Step onto the first token of the value.
                self.move_token()
                return name, self.parse_value()
        raise Exception("parse pair encounter error")


if __name__ == "__main__":
    # Path of the token file produced by the tokenizer
    TOKEN_PATH = "./json_tokens.json"
    # Load the token sequence
    with open(TOKEN_PATH, "r", encoding="utf-8") as token_file:
        input_tokens = list(json.load(token_file))
    if not input_tokens:
        raise Exception("input token sequence is empty")
    # Lookup table for debugging: shows which token each index refers to
    for i, tok in enumerate(input_tokens):
        print(f"debug {i:2d} --> {tok}")
    print("\n===========================================\n")
    parser = JSON_Parser(tokens=input_tokens)
    json_obj = parser.parse()
    # Serialize the parsed object back to formatted JSON
    print(json.dumps(json_obj, ensure_ascii=False, indent=4))
輸出結果:
到此,相信大家對“如何使用Python寫一個簡單的JSONParser”有了更深的了解,不妨來實際操作一番吧!這里是創新互聯網站,更多相關內容可以進入相關頻道進行查詢,關注我們,繼續學習!