// parser.hpp
  1. #ifndef BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_PARSER_HPP
  2. #define BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_PARSER_HPP
  3. #include <boost/property_tree/json_parser/error.hpp>
  4. #include <boost/core/ref.hpp>
  5. #include <boost/bind/bind.hpp>
  6. #include <iterator>
  7. #include <sstream>
  8. #include <string>
  9. namespace boost { namespace property_tree {
  10. namespace json_parser { namespace detail
  11. {
  12. template <typename Encoding, typename Iterator, typename Sentinel>
  13. class source
  14. {
  15. public:
  16. typedef typename std::iterator_traits<Iterator>::value_type
  17. code_unit;
  18. typedef bool (Encoding::*encoding_predicate)(code_unit c) const;
  19. explicit source(Encoding& encoding) : encoding(encoding) {}
  20. template <typename Range>
  21. void set_input(const std::string& filename, const Range& r)
  22. {
  23. this->filename = filename;
  24. cur = r.begin();
  25. end = r.end();
  26. // Note that there is no backtracking, so if e.g. a UTF-8 file
  27. // starts with something that initially looks like a BOM but isn't,
  28. // there's trouble.
  29. // However, no valid JSON file can start with a UTF-8 EF byte.
  30. encoding.skip_introduction(cur, end);
  31. line = 1;
  32. offset = 0;
  33. }
  34. bool done() const { return cur == end; }
  35. void parse_error(const char* msg) {
  36. BOOST_PROPERTY_TREE_THROW(
  37. json_parser_error(msg, filename, line));
  38. }
  39. void next() {
  40. if (encoding.is_nl(*cur)) {
  41. ++line;
  42. offset = 0;
  43. } else {
  44. ++offset;
  45. }
  46. ++cur;
  47. }
  48. template <typename Action>
  49. bool have(encoding_predicate p, Action& a) {
  50. bool found = cur != end && (encoding.*p)(*cur);
  51. if (found) {
  52. a(*cur);
  53. next();
  54. }
  55. return found;
  56. }
  57. bool have(encoding_predicate p) {
  58. DoNothing n;
  59. return have(p, n);
  60. }
  61. template <typename Action>
  62. void expect(encoding_predicate p, const char* msg, Action& a) {
  63. if (!have(p, a)) {
  64. parse_error(msg);
  65. }
  66. }
  67. void expect(encoding_predicate p, const char* msg) {
  68. DoNothing n;
  69. expect(p, msg, n);
  70. }
  71. code_unit need_cur(const char* msg) {
  72. if (cur == end) {
  73. parse_error(msg);
  74. }
  75. return *cur;
  76. }
  77. Iterator& raw_cur() { return cur; }
  78. Sentinel raw_end() { return end; }
  79. private:
  80. struct DoNothing {
  81. void operator ()(code_unit) const {}
  82. };
  83. Encoding& encoding;
  84. Iterator cur;
  85. Sentinel end;
  86. std::string filename;
  87. int line;
  88. int offset;
  89. };
  // Action object handed to the number-parsing routines.
  //
  // General flavour (selected for every iterator category except plain
  // input iterators): the individual code units are ignored while the
  // number is being scanned; on finish() the whole span from the position
  // captured at construction up to the live cursor is converted with
  // Encoding::to_internal and delivered through Callbacks::on_number.
  template <typename Callbacks, typename Encoding, typename Iterator,
            typename IteratorCategory =
                typename std::iterator_traits<Iterator>::iterator_category>
  class number_callback_adapter
  {
      Callbacks& m_callbacks;
      Encoding& m_encoding;
      Iterator m_begin;   // copy of the cursor taken at construction
      Iterator& m_cur;    // the caller's cursor, observed by reference

      // Declared only, never defined: the adapter is not copyable.
      number_callback_adapter(const number_callback_adapter&);

  public:
      number_callback_adapter(Callbacks& callbacks, Encoding& encoding,
                              Iterator& cur)
          : m_callbacks(callbacks),
            m_encoding(encoding),
            m_begin(cur),
            m_cur(cur)
      {}

      // Nothing to do per code unit; the span is picked up in finish().
      void operator ()(typename Encoding::external_char) {}

      // Reports the accumulated span [m_begin, m_cur) as one number.
      void finish() const {
          m_callbacks.on_number(m_encoding.to_internal(m_begin, m_cur));
      }
  };

  // Input-iterator flavour: a single-pass cursor cannot be revisited, so
  // each code unit is forwarded as it is seen. on_begin_number is emitted
  // right before the first unit, on_digit for every unit (converted with
  // Encoding::to_internal_trivial) and on_end_number from finish().
  template <typename Callbacks, typename Encoding, typename Iterator>
  class number_callback_adapter<Callbacks, Encoding, Iterator,
                                std::input_iterator_tag>
  {
      Callbacks& m_callbacks;
      Encoding& m_encoding;
      bool m_at_start;    // true until the first code unit has arrived

      // Declared only, never defined: the adapter is not copyable.
      number_callback_adapter(const number_callback_adapter&);

  public:
      number_callback_adapter(Callbacks& callbacks, Encoding& encoding,
                              Iterator&)
          : m_callbacks(callbacks), m_encoding(encoding), m_at_start(true)
      {}

      void operator ()(typename Encoding::external_char c) {
          if (m_at_start) {
              m_callbacks.on_begin_number();
              m_at_start = false;
          }
          m_callbacks.on_digit(m_encoding.to_internal_trivial(c));
      }

      void finish() const {
          m_callbacks.on_end_number();
      }
  };
  // Helper used while scanning the body of a string.
  //
  // General flavour (every iterator category except plain input
  // iterators): code points are merely skipped with
  // Encoding::skip_codepoint, and a whole "run" of them -- from the
  // position remembered by start_run() (or by the constructor) up to the
  // live cursor -- is converted with Encoding::to_internal and delivered
  // through Callbacks::on_code_units when finish_run() is called.
  template <typename Callbacks, typename Encoding, typename Iterator,
            typename IteratorCategory =
                typename std::iterator_traits<Iterator>::iterator_category>
  class string_callback_adapter
  {
      Callbacks& m_callbacks;
      Encoding& m_encoding;
      Iterator& m_cur;        // the caller's cursor, advanced in place
      Iterator m_run_start;   // where the current run began

      // Declared only, never defined: the adapter is not copyable.
      string_callback_adapter(const string_callback_adapter&);

  public:
      string_callback_adapter(Callbacks& callbacks, Encoding& encoding,
                              Iterator& cur)
          : m_callbacks(callbacks),
            m_encoding(encoding),
            m_cur(cur),
            m_run_start(cur)
      {}

      // Marks the current cursor position as the beginning of a new run.
      void start_run() {
          m_run_start = m_cur;
      }

      // Emits everything between the run start and the cursor.
      void finish_run() {
          m_callbacks.on_code_units(
              m_encoding.to_internal(m_run_start, m_cur));
      }

      // Moves the cursor past one code point; error_fn is handed to the
      // encoding unchanged.
      template <typename Sentinel, typename EncodingErrorFn>
      void process_codepoint(Sentinel end, EncodingErrorFn error_fn) {
          m_encoding.skip_codepoint(m_cur, end, error_fn);
      }
  };
  164. template <typename Callbacks, typename Encoding, typename Iterator>
  165. class string_callback_adapter<Callbacks, Encoding, Iterator,
  166. std::input_iterator_tag>
  167. {
  168. public:
  169. string_callback_adapter(Callbacks& callbacks, Encoding& encoding,
  170. Iterator& cur)
  171. : callbacks(callbacks), encoding(encoding), cur(cur)
  172. {}
  173. void start_run() {}
  174. void finish_run() {}
  175. template <typename Sentinel, typename EncodingErrorFn>
  176. void process_codepoint(Sentinel end, EncodingErrorFn error_fn) {
  177. encoding.transcode_codepoint(cur, end,
  178. boost::bind(&Callbacks::on_code_unit,
  179. boost::ref(callbacks), boost::placeholders::_1),
  180. error_fn);
  181. }
  182. private:
  183. string_callback_adapter(const string_callback_adapter&);
  184. Callbacks& callbacks;
  185. Encoding& encoding;
  186. Iterator& cur;
  187. };
  188. template <typename Callbacks, typename Encoding, typename Iterator,
  189. typename Sentinel>
  190. class parser
  191. {
  192. typedef detail::number_callback_adapter<Callbacks, Encoding, Iterator>
  193. number_adapter;
  194. typedef detail::string_callback_adapter<Callbacks, Encoding, Iterator>
  195. string_adapter;
  196. typedef detail::source<Encoding, Iterator, Sentinel> source;
  197. typedef typename source::code_unit code_unit;
  198. public:
  199. parser(Callbacks& callbacks, Encoding& encoding)
  200. : callbacks(callbacks), encoding(encoding), src(encoding)
  201. {}
  202. template <typename Range>
  203. void set_input(const std::string& filename, const Range& r) {
  204. src.set_input(filename, r);
  205. }
  206. void finish() {
  207. skip_ws();
  208. if (!src.done()) {
  209. parse_error("garbage after data");
  210. }
  211. }
  212. void parse_value() {
  213. if (parse_object()) return;
  214. if (parse_array()) return;
  215. if (parse_string()) return;
  216. if (parse_boolean()) return;
  217. if (parse_null()) return;
  218. if (parse_number()) return;
  219. parse_error("expected value");
  220. }
  221. bool parse_null() {
  222. skip_ws();
  223. if (!have(&Encoding::is_n)) {
  224. return false;
  225. }
  226. expect(&Encoding::is_u, "expected 'null'");
  227. expect(&Encoding::is_l, "expected 'null'");
  228. expect(&Encoding::is_l, "expected 'null'");
  229. callbacks.on_null();
  230. return true;
  231. }
  232. bool parse_boolean() {
  233. skip_ws();
  234. if (have(&Encoding::is_t)) {
  235. expect(&Encoding::is_r, "expected 'true'");
  236. expect(&Encoding::is_u, "expected 'true'");
  237. expect(&Encoding::is_e, "expected 'true'");
  238. callbacks.on_boolean(true);
  239. return true;
  240. }
  241. if (have(&Encoding::is_f)) {
  242. expect(&Encoding::is_a, "expected 'false'");
  243. expect(&Encoding::is_l, "expected 'false'");
  244. expect(&Encoding::is_s, "expected 'false'");
  245. expect(&Encoding::is_e, "expected 'false'");
  246. callbacks.on_boolean(false);
  247. return true;
  248. }
  249. return false;
  250. }
  251. bool parse_number() {
  252. skip_ws();
  253. number_adapter adapter(callbacks, encoding, src.raw_cur());
  254. bool started = false;
  255. if (have(&Encoding::is_minus, adapter)) {
  256. started = true;
  257. }
  258. if (!have(&Encoding::is_0, adapter) && !parse_int_part(adapter)) {
  259. if (started) {
  260. parse_error("expected digits after -");
  261. }
  262. return false;
  263. }
  264. parse_frac_part(adapter);
  265. parse_exp_part(adapter);
  266. adapter.finish();
  267. return true;
  268. }
  269. bool parse_string() {
  270. skip_ws();
  271. if (!have(&Encoding::is_quote)) {
  272. return false;
  273. }
  274. callbacks.on_begin_string();
  275. string_adapter adapter(callbacks, encoding, src.raw_cur());
  276. while (!encoding.is_quote(need_cur("unterminated string"))) {
  277. if (encoding.is_backslash(*src.raw_cur())) {
  278. adapter.finish_run();
  279. next();
  280. parse_escape();
  281. adapter.start_run();
  282. } else {
  283. adapter.process_codepoint(src.raw_end(),
  284. boost::bind(&parser::parse_error,
  285. this, "invalid code sequence"));
  286. }
  287. }
  288. adapter.finish_run();
  289. callbacks.on_end_string();
  290. next();
  291. return true;
  292. }
  293. bool parse_array() {
  294. skip_ws();
  295. if (!have(&Encoding::is_open_bracket)) {
  296. return false;
  297. }
  298. callbacks.on_begin_array();
  299. skip_ws();
  300. if (have(&Encoding::is_close_bracket)) {
  301. callbacks.on_end_array();
  302. return true;
  303. }
  304. do {
  305. parse_value();
  306. skip_ws();
  307. } while (have(&Encoding::is_comma));
  308. expect(&Encoding::is_close_bracket, "expected ']' or ','");
  309. callbacks.on_end_array();
  310. return true;
  311. }
  312. bool parse_object() {
  313. skip_ws();
  314. if (!have(&Encoding::is_open_brace)) {
  315. return false;
  316. }
  317. callbacks.on_begin_object();
  318. skip_ws();
  319. if (have(&Encoding::is_close_brace)) {
  320. callbacks.on_end_object();
  321. return true;
  322. }
  323. do {
  324. if (!parse_string()) {
  325. parse_error("expected key string");
  326. }
  327. skip_ws();
  328. expect(&Encoding::is_colon, "expected ':'");
  329. parse_value();
  330. skip_ws();
  331. } while (have(&Encoding::is_comma));
  332. expect(&Encoding::is_close_brace, "expected '}' or ','");
  333. callbacks.on_end_object();
  334. return true;
  335. }
  336. private:
  337. typedef typename source::encoding_predicate encoding_predicate;
  338. void parse_error(const char* msg) { src.parse_error(msg); }
  339. void next() { src.next(); }
  340. template <typename Action>
  341. bool have(encoding_predicate p, Action& a) { return src.have(p, a); }
  342. bool have(encoding_predicate p) { return src.have(p); }
  343. template <typename Action>
  344. void expect(encoding_predicate p, const char* msg, Action& a) {
  345. src.expect(p, msg, a);
  346. }
  347. void expect(encoding_predicate p, const char* msg) {
  348. src.expect(p, msg);
  349. }
  350. code_unit need_cur(const char* msg) { return src.need_cur(msg); }
  351. void skip_ws() {
  352. while (have(&Encoding::is_ws)) {
  353. }
  354. }
  355. bool parse_int_part(number_adapter& action) {
  356. if (!have(&Encoding::is_digit0, action)) {
  357. return false;
  358. }
  359. parse_digits(action);
  360. return true;
  361. }
  362. void parse_frac_part(number_adapter& action) {
  363. if (!have(&Encoding::is_dot, action)) {
  364. return;
  365. }
  366. expect(&Encoding::is_digit, "need at least one digit after '.'",
  367. action);
  368. parse_digits(action);
  369. }
  370. void parse_exp_part(number_adapter& action) {
  371. if (!have(&Encoding::is_eE, action)) {
  372. return;
  373. }
  374. have(&Encoding::is_plusminus, action);
  375. expect(&Encoding::is_digit, "need at least one digit in exponent",
  376. action);
  377. parse_digits(action);
  378. }
  379. void parse_digits(number_adapter& action) {
  380. while (have(&Encoding::is_digit, action)) {
  381. }
  382. }
  383. void parse_escape() {
  384. if (have(&Encoding::is_quote)) {
  385. feed(0x22);
  386. } else if (have(&Encoding::is_backslash)) {
  387. feed(0x5c);
  388. } else if (have(&Encoding::is_slash)) {
  389. feed(0x2f);
  390. } else if (have(&Encoding::is_b)) {
  391. feed(0x08); // backspace
  392. } else if (have(&Encoding::is_f)) {
  393. feed(0x0c); // formfeed
  394. } else if (have(&Encoding::is_n)) {
  395. feed(0x0a); // line feed
  396. } else if (have(&Encoding::is_r)) {
  397. feed(0x0d); // carriage return
  398. } else if (have(&Encoding::is_t)) {
  399. feed(0x09); // horizontal tab
  400. } else if (have(&Encoding::is_u)) {
  401. parse_codepoint_ref();
  402. } else {
  403. parse_error("invalid escape sequence");
  404. }
  405. }
  406. unsigned parse_hex_quad() {
  407. unsigned codepoint = 0;
  408. for (int i = 0; i < 4; ++i) {
  409. int value = encoding.decode_hexdigit(
  410. need_cur("invalid escape sequence"));
  411. if (value < 0) {
  412. parse_error("invalid escape sequence");
  413. }
  414. codepoint *= 16;
  415. codepoint += value;
  416. next();
  417. }
  418. return codepoint;
  419. }
  420. static bool is_surrogate_high(unsigned codepoint) {
  421. return (codepoint & 0xfc00) == 0xd800;
  422. }
  423. static bool is_surrogate_low(unsigned codepoint) {
  424. return (codepoint & 0xfc00) == 0xdc00;
  425. }
  426. static unsigned combine_surrogates(unsigned high, unsigned low) {
  427. return 0x010000 + (((high & 0x3ff) << 10) | (low & 0x3ff));
  428. }
  429. void parse_codepoint_ref() {
  430. unsigned codepoint = parse_hex_quad();
  431. if (is_surrogate_low(codepoint)) {
  432. parse_error("invalid codepoint, stray low surrogate");
  433. }
  434. if (is_surrogate_high(codepoint)) {
  435. expect(&Encoding::is_backslash,
  436. "invalid codepoint, stray high surrogate");
  437. expect(&Encoding::is_u,
  438. "expected codepoint reference after high surrogate");
  439. int low = parse_hex_quad();
  440. if (!is_surrogate_low(low)) {
  441. parse_error("expected low surrogate after high surrogate");
  442. }
  443. codepoint = combine_surrogates(codepoint, low);
  444. }
  445. feed(codepoint);
  446. }
  447. void feed(unsigned codepoint) {
  448. encoding.feed_codepoint(codepoint,
  449. boost::bind(&Callbacks::on_code_unit,
  450. boost::ref(callbacks), boost::placeholders::_1));
  451. }
  452. Callbacks& callbacks;
  453. Encoding& encoding;
  454. source src;
  455. };
  456. }}}}
  457. #endif