Sunday, June 21, 2015

Parse food recipes from Python with pylangparser

Link to project page

from pylangparser import *

QTY = Symbols(r'\d+')
QTY_HALF = Symbols(r'(\d+ ){0,1}1/2')
QTYS = QTY_HALF & QTY

ONION = Symbols(r'onion')
ROAST = Symbols(r'roast')
SOY_SAUCE = Symbols(r'soy sauce')
STALK_CELERY = Symbols(r'stalk celery')
WATER = Symbols(r'water')
YOGURT = Symbols(r'yogurt')
FAT = Symbols(r'\d+%')

INGREDIENT = \
    ONION & \
    ROAST & \
    SOY_SAUCE & \
    STALK_CELERY & \
    WATER & \
    YOGURT

RANDOM_WORD = Symbols(r'[a-zA-Z]+')
LBS = Symbols(r'lbs')
TABLESPOON = Symbols(r'tablespoon')
TABLESPOONS = Symbols(r'tablespoons')
TEASPOON = Symbols(r'teaspoon')
TEASPOONS = Symbols(r'teaspoons')
QUART = Symbols(r'quart')
QUARTS = Symbols(r'quarts')
DASH = Symbols(r'dash')
DASHES = Symbols(r'dashes')
CUP = Symbols(r'cup')
CUPS = Symbols(r'cups')

METRICS = \
    LBS & \
    TABLESPOON & \
    TABLESPOONS & \
    TEASPOON & \
    TEASPOONS & \
    QUART & \
    QUARTS & \
    DASH & \
    DASHES & \
    CUP & \
    CUPS

IGNORE_CHARS = Ignore(r'[ \n,.-]+')
IGNORES = IGNORE_CHARS

# order is important, the first token that matches will be considered
TOKENS = FAT & QTYS & METRICS & IGNORES & INGREDIENT & RANDOM_WORD
food_recipe = """ 3 lbs chuck roast 1 quart water 1 1/2 quarts water 1 onion, chopped 1 stalk celery, chopped 2 tablespoons soy sauce 1/2 cup 2% Greek yogurt """ product = \ SymbolsParser(ONION) | \ SymbolsParser(ROAST) | \ SymbolsParser(SOY_SAUCE) | \ SymbolsParser(STALK_CELERY) | \ SymbolsParser(WATER) | \ SymbolsParser(YOGURT) measure = \ SymbolsParser(CUP) | \ SymbolsParser(CUPS) | \ SymbolsParser(LBS) | \ SymbolsParser(QUART) | \ SymbolsParser(QUARTS) | \ SymbolsParser(TABLESPOON) | \ SymbolsParser(TABLESPOONS) qty = \ SymbolsParser(QTY_HALF) | \ SymbolsParser(QTY) fat = SymbolsParser(FAT) ingredient = qty << ZeroOrMore(measure) << \ Repeat(fat | product | IgnoreResult(SymbolsParser(RANDOM_WORD))) recipe = ZeroOrMore(ingredient)
# extract tokens
lexer = Lexer(TOKENS)
tokens = lexer.parseTokens(food_recipe, False)
print(tokens)

parser = AllTokensConsumed(recipe)

# generating AST
ast = parser(tokens, 0)
print("\nast:")
ast.pretty_print()

for group in ast:
    if group.check_parser(ingredient):
        print("\ningredient:")
        for sub_group in group:
            if sub_group.check_parser(qty):
                print("qty: %s" % sub_group.get_token())
            if sub_group.check_parser(measure):
                print("measure: %s" % sub_group.get_token())
            if sub_group.check_parser(product):
                print("product: %s" % sub_group.get_token())
            if sub_group.check_parser(fat):
                print("fat level: %s" % sub_group.get_token())
output:
-------
[(3, \d+), (lbs, lbs), (chuck, [a-zA-Z]+), (roast, roast), (1, \d+), (quart, quart), (water, water), (1 1/2, (\d+ ){0,1}1/2), (quarts, quarts), (water, water), (1, \d+), (onion, onion), (chopped, [a-zA-Z]+), (1, \d+), (stalk celery, stalk celery), (chopped, [a-zA-Z]+), (2, \d+), (tablespoons, tablespoons), (soy sauce, soy sauce), (1/2, (\d+ ){0,1}1/2), (cup, cup), (2%, \d+%), (Greek, [a-zA-Z]+), (yogurt, yogurt)]

ast:
[[['3'], ['lbs'], ['roast']], [['1'], ['quart'], ['water']], [['1 1/2'], ['quarts'], ['water']], [['1'], ['onion']], [['1'], ['stalk celery']], [['2'], ['tablespoons'], ['soy sauce']], [['1/2'], ['cup'], ['2%'], ['yogurt']]]

ingredient:
qty: 3
measure: lbs
product: roast

ingredient:
qty: 1
measure: quart
product: water

ingredient:
qty: 1 1/2
measure: quarts
product: water

ingredient:
qty: 1
product: onion

ingredient:
qty: 1
product: stalk celery

ingredient:
qty: 2
measure: tablespoons
product: soy sauce

ingredient:
qty: 1/2
measure: cup
fat level: 2%
product: yogurt
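The key idea in the lexer above is that the token patterns are tried in a fixed priority order and the first one that matches wins, which is why FAT (`\d+%`) and QTY_HALF must come before plain QTY. The same idea can be illustrated with the standard `re` module alone; this is just a self-contained sketch of the principle, not the pylangparser API:

```python
import re

# Priority-ordered token patterns, mirroring the TOKENS order above:
# the first pattern that matches at the current position is taken.
TOKEN_SPECS = [
    ('FAT',        r'\d+%'),
    ('QTY_HALF',   r'(\d+ ){0,1}1/2'),
    ('QTY',        r'\d+'),
    ('METRIC',     r'lbs|tablespoons?|teaspoons?|quarts?|dashes|dash|cups?'),
    ('IGNORE',     r'[ \n,.-]+'),
    ('INGREDIENT', r'onion|roast|soy sauce|stalk celery|water|yogurt'),
    ('WORD',       r'[a-zA-Z]+'),
]

def tokenize(text):
    tokens = []
    pos = 0
    while pos < len(text):
        for name, pattern in TOKEN_SPECS:
            m = re.match(pattern, text[pos:])
            if m:
                if name != 'IGNORE':  # drop whitespace/punctuation, like Ignore()
                    tokens.append((name, m.group(0)))
                pos += m.end()
                break
        else:
            raise ValueError('no token matches at position %d' % pos)
    return tokens
```

For the trickiest line of the recipe, `tokenize("1/2 cup 2% Greek yogurt")` yields `[('QTY_HALF', '1/2'), ('METRIC', 'cup'), ('FAT', '2%'), ('WORD', 'Greek'), ('INGREDIENT', 'yogurt')]`, showing why the priority order matters: with QTY tried first, `1/2` would lex as the number `1` followed by garbage.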

Parse C source code from python and print function declarations/ function calls

Complete source code: pylangparser
Source code to be parsed:
source = r""" #include <stdio.h> struct struct_name { signed short int *p; char p; int * t; int p[5][5]; /* comment */ }; union union_name { signed short int *p; char p; int * t; int p[5][5]; /* comment */ }; typedef unsigned char BYTE; unsigned int p, c; enum some_name{p = 1, q}; char * func(int p, char t);
int func2(const int p, char t)
{
    int l, q;
    char *f;
    unsigned short j;

    q = 5;
    func(12, func1(42), 42);
    printf ("hello world: %" GST_TIME_FORMAT, time);
    best = (GstMatroskaPad *) data;
    gst_ebml_write_set_cache (ebml, 0x40);

    if (mux->doctype_version > 1 && !write_duration) {
        if (is_video_keyframe)
            flags += 0x80;
    }

    if (!(a>b) && !c)
        f(12, a);

    {
        {
            q = 1;
        }
        q = 5;
        return f;
    }

    if (5 == 6) {
    } else {
        p = 1;
    }

    while (5 == 6) {
        p = 1;
    }

    do {
        p = 1;
    } while (5 == 6);

    for (;;) {
        p = 1;
        break;
    }

    for (i = 5; i < 5; i++) {
        p = 1;
        if (i == 4) {
            abort (1);
            break;
        }
    }

    switch (i) {
        case 5: {
            break;
        }
        default: {
            break;
        }
    }

    switch (i) {
        case 5:
            break;
        default:
            break;
    }

    if (p == 5)
        p = 5;
    else
        goto error;

    /*
     * this is a multi-line comment
     */

    return (p == 5);

error:
    {
        if (p == 5)
            p = 5.5;
        return 1;
    }
}

const gchar *
gst_flow_get_name (GstFlowReturn ret)
{
    gint i;

    ret = CLAMP (ret, GST_FLOW_CUSTOM_ERROR, GST_FLOW_CUSTOM_SUCCESS);
    ret = f(a, b);

    for (i = 0; i < G_N_ELEMENTS (flow_quarks); i++) {
        p = flow_quarks[i].ret;
        if (ret == flow_quarks[i].ret)
            return flow_quarks[i].name;
    }

    return "unknown";
}
"""
The parsing code:

 result = translation_unit(tokens, 0)

#
# print all function declarations
#
print("\n--------------function declarations--------------")
for group in result:
    if group.check_parser(function_declaration):
        group.pretty_print()

def perform_call_search(group):
    for sub_group in group:
        # perform deep search
        perform_call_search(sub_group)
        if sub_group.check_parser(call_expression):
            # current sub_group is a call expression, print it
            sub_group.pretty_print()
            func_name = sub_group.get_sub_group(1)
            # func name must fulfill SymbolsParser && IDENTIFIER
            if not (func_name.check_parser_instance(SymbolsParser) and \
                    func_name.check_parser(IDENTIFIER)):
                raise TypeError("internal error, func_name not IDENTIFIER")
            func_name_token = func_name.get_token()
            print("func name: %s" % func_name_token)
            func_args = sub_group.get_sub_group(2)
            if not func_args.check_parser(arglist):
                raise TypeError("internal error, func_args not arglist")
            for arg in func_args:
                print("arg: %s" % arg)

#
# print all function calls within each function
#
print("\n--------------function calls--------------")
for group in result:
    if group.check_parser(function_definition):
        print("\nfound function definition, all function calls within " \
            "its body:")
        perform_call_search(group)
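Note how perform_call_search() descends into every sub-group before checking the current one, so nested calls like func1(42) inside func(12, func1(42), 42) are reported before the enclosing call. The same recursive pattern can be shown on a toy AST of plain tuples and lists; this is a self-contained sketch of the idea, not pylangparser's actual AST classes:

```python
# Toy AST: a function call is a tuple ('call', name, [args]);
# every other node is a plain list of children or a leaf string.
def find_calls(node, found=None):
    if found is None:
        found = []
    is_call = isinstance(node, tuple) and node and node[0] == 'call'
    children = node[2] if is_call else (node if isinstance(node, list) else [])
    # deep search first, so nested calls are reported before the
    # enclosing call -- the same order perform_call_search() prints them
    for child in children:
        find_calls(child, found)
    if is_call:
        found.append(node[1])
    return found

# func(12, func1(42), 42) -- the nested call is discovered as well
ast = ['body', ('call', 'func', ['12', ('call', 'func1', ['42']), '42'])]
```

Here `find_calls(ast)` returns `['func1', 'func']`, matching the order in the output below, where func1 is printed before the func call that contains it.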
Output after running the example script in the package:

--------------function declarations--------------
[['char'], [['*'], [['func'], [[['int'], ['p']], [['char'], ['t']]]]]]

--------------function calls--------------

found function definition, all function calls within its body:
[['func1'], ['42']]
func name: func1
[['func'], [['12'], [[['func1'], ['42']], ['42']]]]
func name: func
arg: (12, instance: (0x[0-9A-Fa-f]*|\d+))
arg: (((func1, instance: [A-Za-z_]+[A-Za-z0-9_]*), (42, instance: (0x[0-9A-Fa-f]*|\d+))), (42, instance: (0x[0-9A-Fa-f]*|\d+)))
[['printf'], [[['"hello world: %"'], ['GST_TIME_FORMAT']], ['time']]]
func name: printf
arg: (("hello world: %", instance: \".*\"), (GST_TIME_FORMAT, instance: [A-Za-z_]+[A-Za-z0-9_]*))
arg: (time, instance: [A-Za-z_]+[A-Za-z0-9_]*)
[['gst_ebml_write_set_cache'], [['ebml'], ['0x40']]]
func name: gst_ebml_write_set_cache
arg: (ebml, instance: [A-Za-z_]+[A-Za-z0-9_]*)
arg: (0x40, instance: (0x[0-9A-Fa-f]*|\d+))
[['f'], [['12'], ['a']]]
func name: f
arg: (12, instance: (0x[0-9A-Fa-f]*|\d+))
arg: (a, instance: [A-Za-z_]+[A-Za-z0-9_]*)
[['abort'], ['1']]
func name: abort

found function definition, all function calls within its body:
[['CLAMP'],
 [['ret'], [['GST_FLOW_CUSTOM_ERROR'], ['GST_FLOW_CUSTOM_SUCCESS']]]]
func name: CLAMP
arg: (ret, instance: [A-Za-z_]+[A-Za-z0-9_]*)
arg: ((GST_FLOW_CUSTOM_ERROR, instance: [A-Za-z_]+[A-Za-z0-9_]*), (GST_FLOW_CUSTOM_SUCCESS, instance: [A-Za-z_]+[A-Za-z0-9_]*))
[['f'], [['a'], ['b']]]
func name: f
arg: (a, instance: [A-Za-z_]+[A-Za-z0-9_]*)
arg: (b, instance: [A-Za-z_]+[A-Za-z0-9_]*)


Wednesday, October 22, 2014

Parse GTK-Doc style comments from Python

A new parser has been added to pylangparser's examples section for parsing GTK-Doc style comments and annotations. Check it out:

PyLangParser
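A GTK-Doc block documents each parameter on its own `@name:` line, optionally prefixed with GObject-Introspection annotations such as (transfer full) or (nullable). As a rough illustration of what such a parser has to pick apart, the annotations can be pulled out with plain regular expressions; the example in the repository builds a proper grammar instead, so treat this as a sketch only:

```python
import re

# Match " * @name:" followed by zero or more "(annotation)" groups.
PARAM_RE = re.compile(r'^\s*\*\s*@(\w+):\s*((?:\([^)]*\)\s*)*)', re.M)

comment = """/**
 * gst_pad_push:
 * @pad: (transfer none): a source #GstPad
 * @buffer: (transfer full): the #GstBuffer to push
 */"""

# Map each parameter name to its list of annotations.
annotations = {name: re.findall(r'\(([^)]*)\)', anns)
               for name, anns in PARAM_RE.findall(comment)}
```

This yields `{'pad': ['transfer none'], 'buffer': ['transfer full']}` for the block above; a real GTK-Doc parser also has to handle the description text, Returns: lines and multi-line continuations, which is where a grammar pays off.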

Sunday, October 12, 2014

Parse SQL scripts from Python

A new parser has been added to pylangparser's examples section for parsing SQL scripts. Check it out:

PyLangParser

Wednesday, October 1, 2014

Parsing C code from Python with PyLangParser

Recently I have been working on a Python tool for static analysis of source code, for programs based on GLib/GStreamer, which takes advantage of the GObject-Introspection annotations built on top of GTK-Doc comment blocks.
The tool discovers potential memory leaks. It is in an early, still unstable phase but I hope that I will manage to make it available sometime in the near future.

It is based on a small Python tool that I wrote a while ago for parsing formal languages. The tool is called pylangparser and is fairly simple to use and does not depend on any external libraries. Grammars are defined inside the source code file. There are also some example scripts which show how it can be used for parsing and analyzing C source code. You may want to take a look at it, it is available on GitHub:

PyLangParser
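The "grammar defined inside the source file" style boils down to parser combinators: small parsers composed into bigger ones with operators or wrapper functions. A generic sketch of that style in a few lines of plain Python (not pylangparser's actual classes, which the recipe and C examples above show in full):

```python
# Each parser is a function: (tokens, pos) -> (result, new_pos) or None.

def token(expected):
    """Match one literal token."""
    def parse(tokens, pos):
        if pos < len(tokens) and tokens[pos] == expected:
            return tokens[pos], pos + 1
        return None
    return parse

def seq(*parsers):
    """Match all parsers in order (like pylangparser's << chaining)."""
    def parse(tokens, pos):
        results = []
        for p in parsers:
            r = p(tokens, pos)
            if r is None:
                return None
            value, pos = r
            results.append(value)
        return results, pos
    return parse

def alt(*parsers):
    """Match the first parser that succeeds (like the | operator)."""
    def parse(tokens, pos):
        for p in parsers:
            r = p(tokens, pos)
            if r is not None:
                return r
        return None
    return parse

# the grammar lives right in the source file, combinator-style
measure = alt(token('cup'), token('quart'))
phrase = seq(token('1'), measure, token('water'))
```

With this, `phrase(['1', 'quart', 'water'], 0)` succeeds with the matched list and the new position, while `phrase(['1', 'x'], 0)` returns None; everything else (repetition, ignoring tokens, AST building) is layered on top of these three primitives.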

Sunday, June 29, 2014

GStreamer based RTSP viewer for Android






Source code is available here: RTSP Viewer for Android

Friday, May 9, 2014

BTHBookStore - Electronic book store implemented in Java and eHTML using the Model–view–controller (MVC) software architectural pattern.

E-Commerce BTHBookStore


Electronic book store implemented in Java/Servlets using the Model–view–controller (MVC) software architectural pattern. The user interface is implemented using eHTML, a simple scripting language which is part of the project.

The source code is distributed under the Creative Commons Attribution-Share Alike 2.5 Generic license (http://creativecommons.org/licenses/by-sa/2.5/) and can be downloaded here: http://sourceforge.net/projects/bthebookstore/


The admin interface:


The store itself:



otonchev@gmail.com