1

I'm working with Graphite monitoring using Carbon and Ceres as the storage method. I have some problems with correcting bad data. It seems that (due to various problems) I've ended up with overlapping files. That is, since Carbon / Ceres stores the data as timestamp@interval.slice, I can have two or more files with overlapping time ranges.

There are two kinds of overlaps:

File A:  +------------+        orig file
File B:      +-----+           subset
File C:          +---------+   overlap

This is causing problems because the existing tools available (ceres-maintenance defrag and rollup) don't cope with these overlaps. Instead, they skip the directory and move on. This is a problem, obviously.

Machavity
  • 30,841
  • 27
  • 92
  • 100
Kevin J. Rice
  • 3,337
  • 2
  • 24
  • 23

1 Answers1

0

I've created a script that fixes this problem, as follows:

  1. For subsets, just delete the subset file.

  2. For overlaps, using the file system 'truncate' on the orig file at the point where the next file starts. While it is possible to cut off the start of the overlap file and rename it properly, I would suggest that this is fraught with danger.

I've found that it's possible to do this in two ways:

  1. Walk the dirs and iterate over the files, fixing as you go, and find the file subsets, remove them;

  2. Walk the dirs and fix all the problems in a dir before moving on. This is BY FAR the faster approach, since the dir walk is hugely time consuming.

Code:

#!/usr/bin/env python2.6
################################################################################

import io
import os
import time
import sys
import string
import logging
import unittest
import datetime
import random
import zmq
import json
import socket
import traceback
import signal
import select
import simplejson
import cPickle as pickle
import re
import shutil
import collections
from pymongo import Connection
from optparse import OptionParser
from pprint import pprint, pformat

################################################################################

class SliceFile(object):
    def __init__(self, fname):
        self.name       = fname
        basename        = fname.split('/')[-1]
        fnArray         = basename.split('@')
        self.timeStart  = int(fnArray[0])
        self.freq       = int(fnArray[1].split('.')[0])
        self.size       = None
        self.numPoints  = None
        self.timeEnd    = None
        self.deleted    = False

    def __repr__(self):
        out = "Name: %s, tstart=%s tEnd=%s, freq=%s, size=%s, npoints=%s." % (
            self.name, self.timeStart, self.timeEnd, self.freq, self.size, self.numPoints)
        return out

    def setVars(self):
        self.size       = os.path.getsize(self.name)
        self.numPoints  = int(self.size / 8)
        self.timeEnd    = self.timeStart + (self.numPoints * self.freq)

################################################################################

class CeresOverlapFixup(object):

    def __del__(self):
        import datetime
        self.writeLog("Ending at %s" % (str(datetime.datetime.today())))
        self.LOGFILE.flush()
        self.LOGFILE.close()

    def __init__(self):
        self.verbose            = False
        self.debug              = False
        self.LOGFILE            = open("ceresOverlapFixup.log", "a")
        self.badFilesList       = set()
        self.truncated          = 0
        self.subsets            = 0
        self.dirsExamined       = 0            
        self.lastStatusTime     = 0

    def getOptionParser(self):
        return OptionParser()

    def getOptions(self):
        parser = self.getOptionParser()
        parser.add_option("-d", "--debug",      action="store_true",                 dest="debug",   default=False, help="debug mode for this program, writes debug messages to logfile." )
        parser.add_option("-v", "--verbose",    action="store_true",                 dest="verbose", default=False, help="verbose mode for this program, prints a lot to stdout." )
        parser.add_option("-b", "--basedir",    action="store",      type="string",  dest="basedir", default=None,  help="base directory location to start converting." )
        (options, args)     = parser.parse_args()
        self.debug          = options.debug
        self.verbose        = options.verbose
        self.basedir        = options.basedir
        assert self.basedir, "must provide base directory."

    # Examples:
    # ./updateOperations/1346805360@60.slice
    # ./updateOperations/1349556660@60.slice
    # ./updateOperations/1346798040@60.slice

    def getFileData(self, inFilename):
        ret = SliceFile(inFilename)
        ret.setVars()
        return ret

    def removeFile(self, inFilename):
        os.remove(inFilename)
        #self.writeLog("removing file: %s" % (inFilename))
        self.subsets += 1

    def truncateFile(self, fname, newSize):
        if self.verbose:
            self.writeLog("Truncating file, name=%s, newsize=%s" % (pformat(fname), pformat(newSize)))
        IFD = None
        try:
            IFD = os.open(fname, os.O_RDWR|os.O_CREAT)
            os.ftruncate(IFD, newSize)
            os.close(IFD)
            self.truncated += 1
        except:
            self.writeLog("Exception during truncate: %s" % (traceback.format_exc()))
        try:
            os.close(IFD)
        except:
            pass
        return

    def printStatus(self):
        now = self.getNowTime()
        if ((now - self.lastStatusTime) > 10):
            self.writeLog("Status: time=%d, Walked %s dirs, subsetFilesRemoved=%s, truncated %s files." % (now, self.dirsExamined, self.subsets, self.truncated))
            self.lastStatusTime = now

    def fixupThisDir(self, inPath, inFiles):

        # self.writeLog("Fixing files in dir: %s" % (inPath))
        if not '.ceres-node' in inFiles:
            # self.writeLog("--> Not a slice directory, skipping.")
            return

        self.dirsExamined += 1            

        sortedFiles = sorted(inFiles)
        sortedFiles = [x for x in sortedFiles if ((x != '.ceres-node') and (x.count('@') > 0)) ]
        lastFile    = None
        fileObjList = []
        for thisFile in sortedFiles:
            wholeFilename = os.path.join(inPath, thisFile)
            try:
                curFile = self.getFileData(wholeFilename)
                fileObjList.append(curFile)
            except:
                self.badFilesList.add(wholeFilename)
                self.writeLog("ERROR: file %s, %s" % (wholeFilename, traceback.format_exc()))

        # name is timeStart, really.
        fileObjList = sorted(fileObjList, key=lambda thisObj: thisObj.name)

        while fileObjList:

            self.printStatus()

            changes = False
            firstFile = fileObjList[0]
            removedFiles = []
            for curFile in fileObjList[1:]:
                if (curFile.timeEnd <= firstFile.timeEnd):
                    # have subset file. elim.
                    self.removeFile(curFile.name)
                    removedFiles.append(curFile.name)
                    self.subsets += 1
                    changes = True
                    if self.verbose:
                        self.writeLog("Subset file situation.  First=%s, overlap=%s" % (firstFile, curFile))
            fileObjList = [x for x in fileObjList if x.name not in removedFiles]
            if (len(fileObjList) < 2):
                break
            secondFile = fileObjList[1]

            # LT is right.  FirstFile's timeEnd is always the first open time after first is done.
            # so, first starts@100, len=2, end=102, positions used=100,101. second start@102 == OK.
            if (secondFile.timeStart < firstFile.timeEnd):
                # truncate first file.
                # file_A (last):    +---------+
                # file_B (curr):         +----------+ 
                # solve by truncating previous file at startpoint of current file.
                newLenFile_A_seconds = int(secondFile.timeStart - firstFile.timeStart)
                newFile_A_datapoints = int(newLenFile_A_seconds / firstFile.freq)
                newFile_A_bytes      = int(newFile_A_datapoints) * 8
                if (not newFile_A_bytes):
                    fileObjList = fileObjList[1:]
                    continue
                assert newFile_A_bytes, "Must have size.  newLenFile_A_seconds=%s, newFile_A_datapoints=%s, newFile_A_bytes=%s." % (newLenFile_A_seconds, newFile_A_datapoints, newFile_A_bytes)
                self.truncateFile(firstFile.name, newFile_A_bytes)
                if self.verbose:
                    self.writeLog("Truncate situation.  First=%s, overlap=%s" % (firstFile, secondFile))
                self.truncated += 1
                fileObjList = fileObjList[1:]
                changes = True

            if not changes:
                fileObjList = fileObjList[1:]


    def getNowTime(self):
        return time.time()


    def walkDirStructure(self):

        startTime           = self.getNowTime()
        self.lastStatusTime = startTime
        updateStatsDict     = {}
        self.okayFiles      = 0
        emptyFiles          = 0 

        for (thisPath, theseDirs, theseFiles) in os.walk(self.basedir):
            self.printStatus()
            self.fixupThisDir(thisPath, theseFiles)
            self.dirsExamined += 1

        endTime = time.time()
        # time.sleep(11)
        self.printStatus()
        self.writeLog( "now = %s, started at %s, elapsed time = %s seconds." % (startTime, endTime, endTime - startTime))
        self.writeLog( "Done.")


    def writeLog(self, instring):
        print instring
        print >> self.LOGFILE, instring
        self.LOGFILE.flush()

    def main(self):
        self.getOptions()
        self.walkDirStructure()
Kevin J. Rice
  • 3,337
  • 2
  • 24
  • 23
  • Yep. Could be cleaned up, agree. But, whatchagonnado? I could spend a long time making this tighter. If you have a better version, please post it. – Kevin J. Rice Jun 11 '14 at 19:04