Text Parsing

Python is an excellent tool for scanning and manipulating textual data.

Example: Extract a token (the text between a keyword and a stop word) from selected files in a folder tree, and save the results to a CSV file


import os

class Info(object):
    """Minimal record wrapping a single ``ID`` value."""

    def __init__(self, ID=None):
        # Stored exactly as given; stays None when no ID is supplied.
        self.ID = ID

    def to_dict(self):
        """Return the record as a one-entry dict keyed by ``'ID'``."""
        return dict(ID=self.ID)

def ParseFile(filepath, keyword, stopword, token):
    """Collect the text found between ``keyword`` and ``stopword`` in a file.

    Each line of ``filepath`` is searched for the first occurrence of
    ``keyword`` and then for the first ``stopword`` that follows it. When
    both are present, the text between them is wrapped in an ``Info`` and
    appended to the module-level ``my_list``. Lines missing either marker
    are skipped, and at most one entry is produced per line.

    The incoming value of ``token`` is never read; the parameter is kept
    only so existing callers continue to work.

    Returns None; results are delivered through ``my_list``.
    """
    with open(filepath) as handle:
        for text in handle:
            key_at = text.find(keyword)
            if key_at < 0:
                continue
            # The wanted text begins right after the keyword.
            begin = key_at + len(keyword)
            end = text.find(stopword, begin)
            if end < 0:
                continue
            my_list.append(Info(text[begin:end]))

# Module-level accumulator; ParseFile appends one Info per extracted token.
my_list = []
# Passed to ParseFile as its `token` argument.
token = ''
# Raw string: "\L" is not a valid escape sequence, so the non-raw spelling
# triggers an invalid-escape warning on current Python versions. The value
# itself is unchanged (C:\Log).
rootdir = r"C:\Log"

# Walk the whole tree under rootdir and parse every file whose name starts
# with "tfs_".
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        if file.startswith("tfs_"):
            filepath = os.path.join(subdir, file)
            ParseFile(filepath, "/sitecore/", ".item", token)


import pandas as pd

# Turn every collected record into a plain dict so pandas can tabulate it.
rows = [entry.to_dict() for entry in my_list]
my_df = pd.DataFrame.from_records(rows)
# Write the values only: no index column and no header row.
my_df.to_csv('out.csv', index=False, header=False)