From 906f8fa0a9749302dc11b55e2e01dde88c5afcf6 Mon Sep 17 00:00:00 2001 From: Harald Alvestrand Date: Tue, 30 Aug 2016 12:36:10 +0200 Subject: [PATCH 1/2] Refactoring the disk cache to have directory as an init parameter Running calculations of scores & best scores over several directories turned out to be complex if directory was passed as a call parameter. This refactoring makes it part of the init parameters of the optimizer, the context and the cache. This means that one has to have functions for "adopting" an encoding into the other context. Fixes #77 --- bin/take_snapshot | 3 +++ bin/verify_scores | 24 +++++++++++------- lib/encoder.py | 52 ++++++++++++++++++++------------------- lib/encoder_unittest.py | 28 ++++++++++++--------- lib/optimizer.py | 25 +++++++++++++------ lib/optimizer_unittest.py | 31 ++++++++++++++++++++--- 6 files changed, 107 insertions(+), 56 deletions(-) diff --git a/bin/take_snapshot b/bin/take_snapshot index 20d9c64..9267a5d 100755 --- a/bin/take_snapshot +++ b/bin/take_snapshot @@ -26,6 +26,9 @@ if [ -z $CODEC_WORKDIR -o ! -d $CODEC_WORKDIR ]; then fi DESTINATION=$WORKDIR/snapshot +echo "Removing old snapshot" +rm -r $DESTINATION +echo "Creating new snapshot" for RESULT in $(find $CODEC_WORKDIR -name '*.result' -o -name 'parameters'); do DEST_FILE=$(echo $RESULT | sed -e "s!$CODEC_WORKDIR!$DESTINATION!") DEST_DIR=$(dirname $DEST_FILE) diff --git a/bin/verify_scores b/bin/verify_scores index 204ceff..50e8086 100755 --- a/bin/verify_scores +++ b/bin/verify_scores @@ -45,20 +45,24 @@ def ClassifyScoreRelation(old_score, new_score): return 'improved' return 'worsened' -def VerifyOneTarget(codec_names, rate, videofile, score): +def VerifyOneTarget(codec_names, rate, videofile, old_scores, score): change_counts = collections.Counter() for codec_name in codec_names: codec = pick_codec.PickCodec(codec_name) - my_optimizer = optimizer.Optimizer(codec) - bestsofar = my_optimizer.BestEncoding(rate, videofile) + old_optimizer = optimizer.Optimizer(codec, scoredir=old_scores) + new_optimizer = optimizer.Optimizer(codec) + bestsofar = new_optimizer.RebaseEncoding( + old_optimizer.BestEncoding(rate, videofile)) + bestsofar.Recover() if score: bestsofar.Execute().Store() try: - old_score = my_optimizer.Score(bestsofar, scoredir='snapshot') + old_score = old_optimizer.Score(bestsofar) except encoder.Error: - print 'No old score for %s, continuing' % (bestsofar.encoder.Hashname()) + print 'No old score for %s %d %s, continuing' % \ + (bestsofar.encoder.Hashname(), rate, videofile.filename) continue - new_score = my_optimizer.Score(bestsofar) + new_score = new_optimizer.Score(bestsofar) result = ClassifyScoreRelation(old_score, new_score) change_counts[result] += 1 if result != 'no change': @@ -70,12 +74,12 @@ def VerifyOneTarget(codec_names, rate, videofile, score): return change_counts -def VerifyResults(codec_names, score): +def VerifyResults(codec_names, old_scores, score): change_counts = collections.Counter() for rate, filename in mpeg_settings.MpegFiles().AllFilesAndRates(): videofile = encoder.Videofile(filename) change_counts.update(VerifyOneTarget(codec_names, rate, - videofile, score)) + videofile, old_scores, score)) print 'Result so far:', dict(change_counts) return change_counts @@ -84,8 +88,10 @@ def main(): parser.add_argument('codec_names', nargs='*', default=pick_codec.AllCodecNames()) parser.add_argument('--score', action='store_true', default=False) + parser.add_argument('--old_scores', default='snapshot') args = parser.parse_args() - change_count = VerifyResults(args.codec_names, score=args.score) + change_count = VerifyResults(args.codec_names, old_scores=args.old_scores, + score=args.score) print 'Change evaluations: ', dict(change_count) return 0 diff --git a/lib/encoder.py b/lib/encoder.py index 9896a9e..2339e44 100644 --- a/lib/encoder.py +++ b/lib/encoder.py @@ -494,12 +494,12 @@ class Context(object): is passed to the context constructor. """ - def __init__(self, codec, cache_class=None): + def __init__(self, codec, cache_class=None, scoredir=None): self.codec = codec if cache_class: - self.cache = cache_class(self) + self.cache = cache_class(self, scoredir) else: - self.cache = EncodingMemoryCache(self) + self.cache = EncodingMemoryCache(self, scoredir) class Encoder(object): @@ -731,11 +731,15 @@ def _FileNameToVideofile(full_filename): class EncodingDiskCache(object): """Encoder and encoding information, saved to disk.""" - def __init__(self, context): + def __init__(self, context, scoredir=None): self.context = context - # Default work directory is current directory. - self.workdir = '%s/%s' % (encoder_configuration.conf.workdir(), - context.codec.name) + if scoredir: + self.workdir = os.path.join(encoder_configuration.conf.sysdir(), + scoredir, context.codec.name) + else: + # Default work directory. + self.workdir = os.path.join(encoder_configuration.conf.workdir(), + context.codec.name) if not os.path.isdir(self.workdir): os.mkdir(self.workdir) @@ -766,7 +770,11 @@ def _FilesToEncodings(self, files, videofile, bitrate=None, encoder or self._FileNameToEncoder(full_filename), bitrate or _FileNameToBitrate(full_filename), videofile or _FileNameToVideofile(full_filename)) - candidate.Recover() + try: + candidate.Recover() + except Error: + print 'Ignoring error in file %s' % full_filename + continue candidates.append(candidate) return candidates @@ -848,22 +856,16 @@ def AllEncoderFilenames(self, only_workdir=False): def RemoveEncoder(self, hashname): shutil.rmtree(os.path.join(self.workdir, hashname)) - def StoreEncoding(self, encoding, workdir=None): + def StoreEncoding(self, encoding): """Stores an encoding object on disk. An encoding object consists of its result (if executed). The bitrate is encoded as a directory, the videofilename is encoded as part of the output filename. """ - if workdir: - dirname = os.path.join(workdir, - self.context.codec.name, - encoding.encoder.Hashname(), - self.context.codec.SpeedGroup(encoding.bitrate)) - else: - dirname = os.path.join(self.workdir, - encoding.encoder.Hashname(), - self.context.codec.SpeedGroup(encoding.bitrate)) + dirname = os.path.join(self.workdir, + encoding.encoder.Hashname(), + self.context.codec.SpeedGroup(encoding.bitrate)) if not os.path.isdir(dirname): os.mkdir(dirname) if not encoding.result: @@ -872,17 +874,14 @@ def StoreEncoding(self, encoding, workdir=None): with open('%s/%s.result' % (dirname, videoname), 'w') as resultfile: json.dump(encoding.result, resultfile, indent=2) - def ReadEncodingResult(self, encoding, scoredir=None): + def ReadEncodingResult(self, encoding): """Reads an encoding result back from storage, if present. None is returned if file does not exist. If scoredir is given, only that directory is searched. If it is not given, all directories in the search path are searched.""" - if scoredir: - searchpath = [os.path.join(scoredir, self.context.codec.name)] - else: - searchpath = self.SearchPathForScores() + searchpath = self.SearchPathForScores() for workdir in searchpath: dirname = os.path.join(workdir, encoding.encoder.Hashname(), @@ -903,13 +902,16 @@ def ReadEncodingResult(self, encoding, scoredir=None): class EncodingMemoryCache(object): """Encoder and encoding information, in-memory only. For testing.""" - def __init__(self, context): + def __init__(self, context, scoredir=None): + if scoredir: + raise Error('Cannot have scoredir on a memory cache') self.context = context self.encoders = {} self.encodings = [] + self.workdir = '/not-valid-file/' + self.context.codec.name def WorkDir(self): - return '/not-valid-file/' + self.context.codec.name + return self.workdir def AllScoredEncodings(self, bitrate, videofile): result = [] diff --git a/lib/encoder_unittest.py b/lib/encoder_unittest.py index a932f42..170ab87 100755 --- a/lib/encoder_unittest.py +++ b/lib/encoder_unittest.py @@ -510,8 +510,11 @@ def testRemoveEncoder(self): def testReadResultFromAlternateDir(self): context = StorageOnlyContext() - otherdir = os.path.join(encoder_configuration.conf.sysdir(), 'otherdir') + otherdir_path = os.path.join(encoder_configuration.conf.sysdir(), + 'otherdir') + os.mkdir(otherdir_path) cache = encoder.EncodingDiskCache(context) + other_cache = encoder.EncodingDiskCache(context, scoredir='otherdir') my_encoder = encoder.Encoder( context, encoder.OptionValueSet(encoder.OptionSet(), '--parameters')) @@ -523,10 +526,11 @@ def testReadResultFromAlternateDir(self): my_encoding.result = testresult cache.StoreEncoding(my_encoding) my_encoding.result = None - result = cache.ReadEncodingResult(my_encoding, scoredir=otherdir) + result = other_cache.ReadEncodingResult(my_encoding) self.assertIsNone(result) - shutil.copytree(encoder_configuration.conf.workdir(), otherdir) - result = cache.ReadEncodingResult(my_encoding, scoredir=otherdir) + shutil.rmtree(otherdir_path) + shutil.copytree(encoder_configuration.conf.workdir(), otherdir_path) + result = other_cache.ReadEncodingResult(my_encoding) self.assertEquals(result, testresult) def testAllScoredEncodingsForEncoder(self): @@ -587,9 +591,10 @@ def testEncodersInMultipleRepos(self): encoder.OptionValueSet(encoder.OptionSet(), '--parameters')) other_dir = os.path.join(encoder_configuration.conf.sysdir(), 'multirepo_test') - encoder_configuration.conf.override_scorepath_for_test([other_dir]) os.mkdir(other_dir) - cache.StoreEncoder(my_encoder, workdir=other_dir) + other_cache = encoder.EncodingDiskCache(context, scoredir='multirepo_test') + encoder_configuration.conf.override_scorepath_for_test([other_dir]) + other_cache.StoreEncoder(my_encoder) encoders = cache.AllEncoderFilenames(only_workdir=True) self.assertEquals(0, len(encoders)) encoders = cache.AllEncoderFilenames(only_workdir=False) @@ -601,13 +606,14 @@ def testEncodersInMultipleRepos(self): encoder.Videofile('x/foo_640_480_20.yuv')) testresult = {'foo': 'bar'} my_encoding.result = testresult - cache.StoreEncoding(my_encoding, workdir=other_dir) + other_cache.StoreEncoding(my_encoding) # With a specified directory, we should find it in only one place. - self.assertFalse(cache.ReadEncodingResult(my_encoding, - scoredir=encoder_configuration.conf.workdir())) - self.assertTrue(cache.ReadEncodingResult(my_encoding, scoredir=other_dir)) - # Without a specified directory, we should find it everywhere. + self.assertTrue(other_cache.ReadEncodingResult(my_encoding)) + # Without a specified directory, we should find it on the searchpath. self.assertTrue(cache.ReadEncodingResult(my_encoding)) + # Without a searchpath, we shouldn't find it in the default cache. + encoder_configuration.conf.override_scorepath_for_test([]) + self.assertFalse(cache.ReadEncodingResult(my_encoding)) class TestEncodingMemoryCache(unittest.TestCase): diff --git a/lib/optimizer.py b/lib/optimizer.py index 5e665ae..66977f6 100644 --- a/lib/optimizer.py +++ b/lib/optimizer.py @@ -30,22 +30,22 @@ class Optimizer(object): - A video file set, with associated target bitrates. - A set of pre-executed encodings (the cache). - A score function. + - A score directory, normally null, which means "take from context". One should be able ask an optimizer to find the parameters that give the best result on the score function for that codec.""" def __init__(self, codec, file_set=None, - cache_class=None, score_function=None): + cache_class=None, score_function=None, + scoredir=None): + # pylint: disable=too-many-arguments self.context = encoder.Context(codec, - cache_class or encoder.EncodingDiskCache) + cache_class or encoder.EncodingDiskCache, + scoredir=scoredir) self.file_set = file_set self.score_function = score_function or score_tools.ScorePsnrBitrate - def Score(self, encoding, scoredir=None): - if scoredir is None: - result = encoding.result - else: - result = self.context.cache.ReadEncodingResult(encoding, - scoredir=scoredir) + def Score(self, encoding): + result = encoding.result if not result: raise encoder.Error('Trying to score an encoding without result') score = self.score_function(encoding.bitrate, result) @@ -53,6 +53,15 @@ def Score(self, encoding, scoredir=None): score -= len(encoding.encoder.parameters.values) * 0.00001 return score + def RebaseEncoding(self, encoding): + """Take an encoding from another context and rebase it to + this context, using the same encoder arguments.""" + return encoder.Encoding(self.RebaseEncoder(encoding.encoder), + encoding.bitrate, encoding.videofile) + + def RebaseEncoder(self, my_encoder): + return encoder.Encoder(self.context, my_encoder.parameters) + def BestEncoding(self, bitrate, videofile): encodings = self.AllScoredEncodings(bitrate, videofile) if encodings: diff --git a/lib/optimizer_unittest.py b/lib/optimizer_unittest.py index df73e25..1a56dc6 100755 --- a/lib/optimizer_unittest.py +++ b/lib/optimizer_unittest.py @@ -14,6 +14,7 @@ # limitations under the License. """ Unit tests for the optimizer. """ +import os import re import unittest @@ -241,7 +242,9 @@ def EncoderFromParameterString(self, parameter_string): parameter_string)) def test_BestOverallConfigurationNotInWorkDirectory(self): - other_dir = encoder_configuration.conf.sysdir() + '/multirepo_test' + other_dir = os.path.join(encoder_configuration.conf.sysdir(), + 'multirepo_test') + os.mkdir(other_dir) encoder_configuration.conf.override_scorepath_for_test([other_dir]) self.file_set = optimizer.FileAndRateSet(verify_files_present=False) @@ -251,13 +254,15 @@ def test_BestOverallConfigurationNotInWorkDirectory(self): best_encoder = self.optimizer.BestOverallEncoder() self.assertIsNone(best_encoder) # Fill in the database with all the files and rates. + other_context = encoder.Context(self.codec, encoder.EncodingDiskCache, + scoredir='multirepo_test') my_encoder = self.EncoderFromParameterString('--score=7') - my_encoder.context.cache.StoreEncoder(my_encoder, workdir=other_dir) + other_context.cache.StoreEncoder(my_encoder) my_encoder.context.cache.StoreEncoder(my_encoder) for rate, filename in self.file_set.AllFilesAndRates(): my_encoding = my_encoder.Encoding(rate, encoder.Videofile(filename)) my_encoding.Execute() - my_encoding.context.cache.StoreEncoding(my_encoding, workdir=other_dir) + other_context.cache.StoreEncoding(my_encoding) # The best encoder should now be from the workdir, but the results are # all fetched from the searchpath. best_encoder = self.optimizer.BestOverallEncoder() @@ -268,6 +273,26 @@ def test_BestOverallConfigurationNotInWorkDirectory(self): one_encoding.Recover() self.assertTrue(one_encoding.Result()) + def test_MultipleOptimizers(self): + # Make sure other score directories don't interfere with this test. + encoder_configuration.conf.override_scorepath_for_test([]) + + os.mkdir(os.path.join(encoder_configuration.conf.sysdir(), 'first_dir')) + os.mkdir(os.path.join(encoder_configuration.conf.sysdir(), 'second_dir')) + one_optimizer = optimizer.Optimizer(self.codec, scoredir='first_dir') + another_optimizer = optimizer.Optimizer(self.codec, scoredir='second_dir') + self.assertNotEqual(one_optimizer.context.cache.workdir, + another_optimizer.context.cache.workdir) + # Storing one encoding's score should not affect the other's. + one_encoding = one_optimizer.BestEncoding(100, + self.videofile) + one_encoding.Execute().Store() + another_encoding = another_optimizer.BestEncoding(100, self.videofile) + self.assertFalse(another_encoding.Result()) + another_encoding.Recover() + self.assertFalse(another_encoding.Result()) + + class TestFileAndRateSet(unittest.TestCase): From 6c754c5926cd161f64c4e5e53df340558de986a4 Mon Sep 17 00:00:00 2001 From: Harald Alvestrand Date: Tue, 6 Sep 2016 16:21:28 +0200 Subject: [PATCH 2/2] Addressed comments --- lib/encoder.py | 8 +++++--- lib/encoder_unittest.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/lib/encoder.py b/lib/encoder.py index 2339e44..806454c 100644 --- a/lib/encoder.py +++ b/lib/encoder.py @@ -733,6 +733,7 @@ class EncodingDiskCache(object): """Encoder and encoding information, saved to disk.""" def __init__(self, context, scoredir=None): self.context = context + self.bad_encodings = {} if scoredir: self.workdir = os.path.join(encoder_configuration.conf.sysdir(), scoredir, context.codec.name) @@ -772,10 +773,11 @@ def _FilesToEncodings(self, files, videofile, bitrate=None, videofile or _FileNameToVideofile(full_filename)) try: candidate.Recover() - except Error: - print 'Ignoring error in file %s' % full_filename + candidates.append(candidate) + except Error as err: + # TODO(hta): Change this to a more specific catch once available. + self.bad_encodings[full_filename] = err continue - candidates.append(candidate) return candidates def _QueryScoredEncodings(self, encoder=None, bitrate=None, videofile=None): diff --git a/lib/encoder_unittest.py b/lib/encoder_unittest.py index 170ab87..b0be6a0 100755 --- a/lib/encoder_unittest.py +++ b/lib/encoder_unittest.py @@ -615,6 +615,38 @@ def testEncodersInMultipleRepos(self): encoder_configuration.conf.override_scorepath_for_test([]) self.assertFalse(cache.ReadEncodingResult(my_encoding)) + def testBrokenStoredEncoding(self): + context = StorageOnlyContext() + other_dir = os.path.join(encoder_configuration.conf.sysdir(), + 'broken_files') + os.mkdir(other_dir) + + cache = encoder.EncodingDiskCache(context, scoredir='broken_files') + # This particular test needs the context to know about the cache. + context.cache = cache + my_encoder = encoder.Encoder( + context, + encoder.OptionValueSet(encoder.OptionSet(), '--parameters')) + cache.StoreEncoder(my_encoder) + # Cache should start off empty. + self.assertFalse(cache.AllScoredEncodingsForEncoder(my_encoder)) + videofile = encoder.Videofile('x/foo_640_480_20.yuv') + my_encoding = encoder.Encoding(my_encoder, 123, videofile) + testresult = {'foo': 'bar'} + my_encoding.result = testresult + cache.StoreEncoding(my_encoding) + # TODO(hta): Expose the filename generation as a function for testing. + with open(os.path.join(cache.workdir, + my_encoding.encoder.Hashname(), + cache.context.codec.SpeedGroup(my_encoding.bitrate), + '%s.result' % my_encoding.videofile.basename), + 'w') as scorefile: + scorefile.write('stuff that is not valid json') + + result = cache.AllScoredEncodingsForEncoder(my_encoder) + self.assertFalse(result) + self.assertEquals(1, len(cache.bad_encodings)) + class TestEncodingMemoryCache(unittest.TestCase): def testStoreMultipleEncodings(self):