Sunday, June 21, 2015

Parse food recipes from Python with pylangparser

Link to project page

from pylangparser import *

# ---------------------------------------------------------------------------
# Token definitions (lexer side). Each Symbols() wraps one regular expression.
# ---------------------------------------------------------------------------

# Quantities: a plain integer, or a half with an optional leading integer
# ("1/2", "1 1/2").
QTY = Symbols(r'\d+')
QTY_HALF = Symbols(r'(\d+ ){0,1}1/2')
QTYS = QTY_HALF & QTY

# Known products.
ONION = Symbols(r'onion')
ROAST = Symbols(r'roast')
SOY_SAUCE = Symbols(r'soy sauce')
STALK_CELERY = Symbols(r'stalk celery')
WATER = Symbols(r'water')
YOGURT = Symbols(r'yogurt')

# Fat level, e.g. "2%".
FAT = Symbols(r'\d+%')

INGREDIENT = (
    ONION &
    ROAST &
    SOY_SAUCE &
    STALK_CELERY &
    WATER &
    YOGURT
)

# Fallback for any word that is not a known product or measure
# (e.g. "chuck", "chopped", "Greek").
RANDOM_WORD = Symbols(r'[a-zA-Z]+')

# Units of measure.
LBS = Symbols(r'lbs')
TABLESPOON = Symbols(r'tablespoon')
TABLESPOONS = Symbols(r'tablespoons')
TEASPOON = Symbols(r'teaspoon')
TEASPOONS = Symbols(r'teaspoons')
QUART = Symbols(r'quart')
QUARTS = Symbols(r'quarts')
DASH = Symbols(r'dash')
DASHES = Symbols(r'dashes')
CUP = Symbols(r'cup')
CUPS = Symbols(r'cups')

METRICS = (
    LBS &
    TABLESPOON &
    TABLESPOONS &
    TEASPOON &
    TEASPOONS &
    QUART &
    QUARTS &
    DASH &
    DASHES &
    CUP &
    CUPS
)

# Whitespace and punctuation between tokens.
IGNORE_CHARS = Ignore(r'[ \n,.-]+')
IGNORES = IGNORE_CHARS

# order is important, first token that matches will be considered
TOKENS = FAT & QTYS & METRICS & IGNORES & INGREDIENT & RANDOM_WORD

# The text to parse: one ingredient per line.
food_recipe = """
3 lbs chuck roast
1 quart water
1 1/2 quarts water
1 onion, chopped
1 stalk celery, chopped
2 tablespoons soy sauce
1/2 cup 2% Greek yogurt
"""

# ---------------------------------------------------------------------------
# Grammar (parser side), built from the token definitions above.
# ---------------------------------------------------------------------------

product = (
    SymbolsParser(ONION) |
    SymbolsParser(ROAST) |
    SymbolsParser(SOY_SAUCE) |
    SymbolsParser(STALK_CELERY) |
    SymbolsParser(WATER) |
    SymbolsParser(YOGURT)
)

measure = (
    SymbolsParser(CUP) |
    SymbolsParser(CUPS) |
    SymbolsParser(LBS) |
    SymbolsParser(QUART) |
    SymbolsParser(QUARTS) |
    SymbolsParser(TABLESPOON) |
    SymbolsParser(TABLESPOONS)
)

qty = (
    SymbolsParser(QTY_HALF) |
    SymbolsParser(QTY)
)

fat = SymbolsParser(FAT)

# An ingredient is a quantity, an optional measure, then a run of fat levels,
# products and filler words. Filler words are wrapped in IgnoreResult; in the
# sample output they do not appear in the AST.
ingredient = qty << ZeroOrMore(measure) << \
    Repeat(fat | product | IgnoreResult(SymbolsParser(RANDOM_WORD)))

recipe = ZeroOrMore(ingredient)

# extract tokens
lexer = Lexer(TOKENS)
tokens = lexer.parseTokens(food_recipe, False)
print(tokens)

parser = AllTokensConsumed(recipe)

# generating AST
ast = parser(tokens, 0)
print("\nast:")
ast.pretty_print()

# Walk the AST: every top-level group matched by `ingredient` is reported
# field by field, in the order its sub-groups appear.
for group in ast:
    if group.check_parser(ingredient):
        print("\ningredient:")
        for sub_group in group:
            if sub_group.check_parser(qty):
                print("qty: %s" % sub_group.get_token())
            if sub_group.check_parser(measure):
                print("measure: %s" % sub_group.get_token())
            if sub_group.check_parser(product):
                print("product: %s" % sub_group.get_token())
            if sub_group.check_parser(fat):
                print("fat level: %s" % sub_group.get_token())
output:
-------
[(3, \d+), (lbs, lbs), (chuck, [a-zA-Z]+), (roast, roast), (1, \d+), (quart, quart), (water, water), (1 1/2, (\d+ ){0,1}1/2), (quarts, quarts), (water, water), (1, \d+), (onion, onion), (chopped, [a-zA-Z]+), (1, \d+), (stalk celery, stalk celery), (chopped, [a-zA-Z]+), (2, \d+), (tablespoons, tablespoons), (soy sauce, soy sauce), (1/2, (\d+ ){0,1}1/2), (cup, cup), (2%, \d+%), (Greek, [a-zA-Z]+), (yogurt, yogurt)]

ast:
[[['3'], ['lbs'], ['roast']], [['1'], ['quart'], ['water']], [['1 1/2'], ['quarts'], ['water']], [['1'], ['onion']], [['1'], ['stalk celery']], [['2'], ['tablespoons'], ['soy sauce']], [['1/2'], ['cup'], ['2%'], ['yogurt']]]

ingredient:
qty: 3
measure: lbs
product: roast

ingredient:
qty: 1
measure: quart
product: water

ingredient:
qty: 1 1/2
measure: quarts
product: water

ingredient:
qty: 1
product: onion

ingredient:
qty: 1
product: stalk celery

ingredient:
qty: 2
measure: tablespoons
product: soy sauce

ingredient:
qty: 1/2
measure: cup
fat level: 2%
product: yogurt

Parse C source code from Python and print function declarations and function calls

Complete source code: pylangparser
Source code to be parsed:
# Sample C translation unit fed to the parser. It is kept as a raw string so
# that no character inside the C text is treated as a Python escape. The
# preprocessor directive sits on its own line because a C directive extends
# to the end of the line.
source = r"""
#include <stdio.h>

struct struct_name {
    signed short int *p;
    char p;
    int * t;
    int p[5][5];
    /* comment */
};

union union_name {
    signed short int *p;
    char p;
    int * t;
    int p[5][5];
    /* comment */
};

typedef unsigned char BYTE;

unsigned int p, c;

enum some_name{p = 1, q};

char * func(int p, char t);

int func2(const int p, char t) {
    int l, q;
    char *f;
    unsigned short j;
    q = 5;
    func(12, func1(42), 42);
    printf ("hello world: %" GST_TIME_FORMAT, time);
    best = (GstMatroskaPad *) data;
    gst_ebml_write_set_cache (ebml, 0x40);
    if (mux->doctype_version > 1 && !write_duration) {
        if (is_video_keyframe)
            flags += 0x80;
    }
    if (!(a>b) && !c)
        f(12, a);
    {
        {
            q = 1;
        }
        q = 5;
        return f;
    }
    if (5 == 6) {
    } else {
        p = 1;
    }
    while (5 == 6) {
        p = 1;
    }
    do {
        p = 1;
    } while (5 == 6);
    for (;;) {
        p = 1;
        break;
    }
    for (i = 5; i < 5; i++) {
        p = 1;
        if (i == 4) {
            abort (1);
            break;
        }
    }
    switch (i) {
        case 5: {
            break;
        }
        default: {
            break;
        }
    }
    switch (i) {
        case 5:
            break;
        default:
            break;
    }
    if (p == 5)
        p = 5;
    else
        goto error;
    /*
     * this is a multi-line comment
     */
    return (p == 5);
error:
    {
        if (p == 5)
            p = 5.5;
        return 1;
    }
}

const gchar *
gst_flow_get_name (GstFlowReturn ret)
{
    gint i;
    ret = CLAMP (ret, GST_FLOW_CUSTOM_ERROR, GST_FLOW_CUSTOM_SUCCESS);
    ret = f(a, b);
    for (i = 0; i < G_N_ELEMENTS (flow_quarks); i++) {
        p = flow_quarks[i].ret;
        if (ret == flow_quarks[i].ret)
            return flow_quarks[i].name;
    }
    return "unknown";
}
"""
The source code:

 # Apply the translation_unit rule to the token list. The result is iterated
 # below as a sequence of groups.
 # NOTE(review): the second argument is presumably the start index into
 # `tokens` - confirm against pylangparser.
 result = translation_unit(tokens, 0)

#
# print all function declarations
#
print("\n--------------function declarations--------------")
for group in result:
    if group.check_parser(function_declaration):
        group.pretty_print()


def perform_call_search(group):
    """Recursively print every call expression found inside *group*.

    Nested groups are visited before the group that contains them, so for
    nested calls the inner call is printed first. For each call expression
    the group itself is pretty-printed, followed by the function name and
    one line per element of its argument group.

    Raises:
        TypeError: if a call expression's sub-group 1 is not an IDENTIFIER
            matched by a SymbolsParser, or its sub-group 2 is not an
            ``arglist``.
    """
    for sub_group in group:
        # perform deep search
        perform_call_search(sub_group)

        if sub_group.check_parser(call_expression):
            # current sub_group is a call expression, print it
            sub_group.pretty_print()

            func_name = sub_group.get_sub_group(1)
            # func name must fulfill SymbolsParser && IDENTIFIER
            if not (func_name.check_parser_instance(SymbolsParser) and
                    func_name.check_parser(IDENTIFIER)):
                raise TypeError("internal error, func_name not IDENTIFIER")
            func_name_token = func_name.get_token()
            print("func name: %s" % func_name_token)

            func_args = sub_group.get_sub_group(2)
            if not func_args.check_parser(arglist):
                raise TypeError("internal error, func_args not arglist")
            for arg in func_args:
                print("arg: %s" % arg)


#
# print all function calls within each function
#
print("\n--------------function calls--------------")
for group in result:
    if group.check_parser(function_definition):
        print("\nfound function definition, all function calls within "
              "its body:")
        perform_call_search(group)
Output after running the example script in the package:

--------------function declarations--------------
[['char'], [['*'], [['func'], [[['int'], ['p']], [['char'], ['t']]]]]]

--------------function calls--------------

found function definition, all function calls within its body:
[['func1'], ['42']]
func name: func1
[['func'], [['12'], [[['func1'], ['42']], ['42']]]]
func name: func
arg: (12, instance: (0x[0-9A-Fa-f]*|\d+))
arg: (((func1, instance: [A-Za-z_]+[A-Za-z0-9_]*), (42, instance: (0x[0-9A-Fa-f]*|\d+))), (42, instance: (0x[0-9A-Fa-f]*|\d+)))
[['printf'], [[['"hello world: %"'], ['GST_TIME_FORMAT']], ['time']]]
func name: printf
arg: (("hello world: %", instance: \".*\"), (GST_TIME_FORMAT, instance: [A-Za-z_]+[A-Za-z0-9_]*))
arg: (time, instance: [A-Za-z_]+[A-Za-z0-9_]*)
[['gst_ebml_write_set_cache'], [['ebml'], ['0x40']]]
func name: gst_ebml_write_set_cache
arg: (ebml, instance: [A-Za-z_]+[A-Za-z0-9_]*)
arg: (0x40, instance: (0x[0-9A-Fa-f]*|\d+))
[['f'], [['12'], ['a']]]
func name: f
arg: (12, instance: (0x[0-9A-Fa-f]*|\d+))
arg: (a, instance: [A-Za-z_]+[A-Za-z0-9_]*)
[['abort'], ['1']]
func name: abort

found function definition, all function calls within its body:
[['CLAMP'],
 [['ret'], [['GST_FLOW_CUSTOM_ERROR'], ['GST_FLOW_CUSTOM_SUCCESS']]]]
func name: CLAMP
arg: (ret, instance: [A-Za-z_]+[A-Za-z0-9_]*)
arg: ((GST_FLOW_CUSTOM_ERROR, instance: [A-Za-z_]+[A-Za-z0-9_]*), (GST_FLOW_CUSTOM_SUCCESS, instance: [A-Za-z_]+[A-Za-z0-9_]*))
[['f'], [['a'], ['b']]]
func name: f
arg: (a, instance: [A-Za-z_]+[A-Za-z0-9_]*)
arg: (b, instance: [A-Za-z_]+[A-Za-z0-9_]*)