From 0bd8cd384bbbb2062d2850923dfb33dc9c25a0b9 Mon Sep 17 00:00:00 2001 From: Alexander Sulfrian Date: Tue, 21 Jul 2015 20:48:04 +0200 Subject: Lint/MergeFiles: Ignore binary files Ignore files with binary content, because SequenceMatcher seems to have problems and sometimes detect files with different content as identically. --- src/lib/Bcfg2/Server/Lint/MergeFiles.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/lib/Bcfg2/Server/Lint/MergeFiles.py b/src/lib/Bcfg2/Server/Lint/MergeFiles.py index 8e6a926ae..bdb97cee2 100644 --- a/src/lib/Bcfg2/Server/Lint/MergeFiles.py +++ b/src/lib/Bcfg2/Server/Lint/MergeFiles.py @@ -17,6 +17,12 @@ def threshold(val): return rv +def is_binary(data): + """ Check if a given string contains only text or binary data. """ + text_chars = bytearray([7, 8, 9, 10, 12, 13, 27] + range(0x20, 0x100)) + return bool(data.translate(None, text_chars)) + + class MergeFiles(Bcfg2.Server.Lint.ServerPlugin): """ find Probes or Cfg files with multiple similar files that might be merged into one """ @@ -50,6 +56,7 @@ class MergeFiles(Bcfg2.Server.Lint.ServerPlugin): for filename, entryset in self.core.plugins['Cfg'].entries.items(): candidates = dict([(f, e) for f, e in entryset.entries.items() if (isinstance(e, CfgGenerator) and + not is_binary(e.data) and f not in ignore and not f.endswith(".crypt"))]) similar, identical = self.get_similar(candidates) -- cgit v1.2.3-1-g7c22 From 06a6fce3f2f5c78a12937d4e52de3d824e3dd5e0 Mon Sep 17 00:00:00 2001 From: Alexander Sulfrian Date: Wed, 22 Jul 2015 16:23:07 +0200 Subject: Utils: Generalize is_string from POSIX/File is_string from POSIX/File could be used in other situations, too. So we move it to Utils, use it from Lint/MergeFiles and replace a custom is_binary function. --- src/lib/Bcfg2/Client/Tools/POSIX/File.py | 20 +++----------------- src/lib/Bcfg2/Server/Lint/MergeFiles.py | 10 +++------- src/lib/Bcfg2/Utils.py | 16 ++++++++++++++++ 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/lib/Bcfg2/Client/Tools/POSIX/File.py b/src/lib/Bcfg2/Client/Tools/POSIX/File.py index fc445e07c..1f1772d46 100644 --- a/src/lib/Bcfg2/Client/Tools/POSIX/File.py +++ b/src/lib/Bcfg2/Client/Tools/POSIX/File.py @@ -8,6 +8,7 @@ import tempfile import Bcfg2.Options from Bcfg2.Client.Tools.POSIX.base import POSIXTool from Bcfg2.Compat import unicode, b64encode, b64decode # pylint: disable=W0622 +import Bcfg2.Utils class POSIXFile(POSIXTool): @@ -17,21 +18,6 @@ class POSIXFile(POSIXTool): def fully_specified(self, entry): return entry.text is not None or entry.get('empty', 'false') == 'true' - def _is_string(self, strng, encoding): - """ Returns true if the string contains no ASCII control - characters and can be decoded from the specified encoding. """ - for char in strng: - if ord(char) < 9 or ord(char) > 13 and ord(char) < 32: - return False - if not hasattr(strng, "decode"): - # py3k - return True - try: - strng.decode(encoding) - return True - except: # pylint: disable=W0702 - return False - def _get_data(self, entry): """ Get a tuple of (, ) for the given entry """ is_binary = entry.get('encoding', 'ascii') == 'base64' @@ -181,8 +167,8 @@ class POSIXFile(POSIXTool): (entry.get("name"), sys.exc_info()[1])) return False if not is_binary: - is_binary |= not self._is_string(content, - Bcfg2.Options.setup.encoding) + is_binary |= not Bcfg2.Utils.is_string( + content, Bcfg2.Options.setup.encoding) if is_binary: # don't compute diffs if the file is binary prompt.append('Binary file, no printable diff') diff --git a/src/lib/Bcfg2/Server/Lint/MergeFiles.py b/src/lib/Bcfg2/Server/Lint/MergeFiles.py index bdb97cee2..3a6251594 100644 --- a/src/lib/Bcfg2/Server/Lint/MergeFiles.py +++ b/src/lib/Bcfg2/Server/Lint/MergeFiles.py @@ -6,6 +6,7 @@ import copy from difflib import SequenceMatcher import Bcfg2.Server.Lint from Bcfg2.Server.Plugins.Cfg import CfgGenerator +from Bcfg2.Utils import is_string def threshold(val): @@ -17,12 +18,6 @@ def threshold(val): return rv -def is_binary(data): - """ Check if a given string contains only text or binary data. """ - text_chars = bytearray([7, 8, 9, 10, 12, 13, 27] + range(0x20, 0x100)) - return bool(data.translate(None, text_chars)) - - class MergeFiles(Bcfg2.Server.Lint.ServerPlugin): """ find Probes or Cfg files with multiple similar files that might be merged into one """ @@ -56,7 +51,8 @@ class MergeFiles(Bcfg2.Server.Lint.ServerPlugin): for filename, entryset in self.core.plugins['Cfg'].entries.items(): candidates = dict([(f, e) for f, e in entryset.entries.items() if (isinstance(e, CfgGenerator) and - not is_binary(e.data) and + is_string(e.data, + Bcfg2.Options.setup.encoding) and f not in ignore and not f.endswith(".crypt"))]) similar, identical = self.get_similar(candidates) diff --git a/src/lib/Bcfg2/Utils.py b/src/lib/Bcfg2/Utils.py index 10057b63e..64d0d8b93 100644 --- a/src/lib/Bcfg2/Utils.py +++ b/src/lib/Bcfg2/Utils.py @@ -330,3 +330,19 @@ class classproperty(object): # pylint: disable=C0103 def __get__(self, instance, owner): return self.getter(owner) + + +def is_string(strng, encoding): + """ Returns true if the string contains no ASCII control + characters and can be decoded from the specified encoding. """ + for char in strng: + if ord(char) < 9 or ord(char) > 13 and ord(char) < 32: + return False + if not hasattr(strng, "decode"): + # py3k + return True + try: + strng.decode(encoding) + return True + except: # pylint: disable=W0702 + return False -- cgit v1.2.3-1-g7c22 From b54c32c861b517b889b1cd6c0bbe082b8d93e5d2 Mon Sep 17 00:00:00 2001 From: Alexander Sulfrian Date: Wed, 22 Jul 2015 17:15:07 +0200 Subject: tests: is_string is now in Bcfg2.Utils --- .../TestClient/TestTools/TestPOSIX/TestFile.py | 28 +++++----------------- testsuite/Testsrc/Testlib/TestUtils.py | 15 ++++++++++++ 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/testsuite/Testsrc/Testlib/TestClient/TestTools/TestPOSIX/TestFile.py b/testsuite/Testsrc/Testlib/TestClient/TestTools/TestPOSIX/TestFile.py index 69dd562be..47d3b84ed 100644 --- a/testsuite/Testsrc/Testlib/TestClient/TestTools/TestPOSIX/TestFile.py +++ b/testsuite/Testsrc/Testlib/TestClient/TestTools/TestPOSIX/TestFile.py @@ -37,22 +37,6 @@ class TestPOSIXFile(TestPOSIXTool): entry.text = "text" self.assertTrue(ptool.fully_specified(entry)) - def test_is_string(self): - ptool = self.get_obj() - - for char in list(range(8)) + list(range(14, 32)): - self.assertFalse(ptool._is_string("foo" + chr(char) + "bar", - 'UTF-8')) - for char in list(range(9, 14)) + list(range(33, 128)): - self.assertTrue(ptool._is_string("foo" + chr(char) + "bar", - 'UTF-8')) - ustr = 'é' - self.assertTrue(ptool._is_string(ustr, 'UTF-8')) - if not inPy3k: - self.assertFalse(ptool._is_string("foo" + chr(128) + "bar", - 'ascii')) - self.assertFalse(ptool._is_string(ustr, 'ascii')) - def test_get_data(self): orig_entry = lxml.etree.Element("Path", name="/test", type="file") Bcfg2.Options.setup.encoding = "ascii" @@ -216,7 +200,8 @@ class TestPOSIXFile(TestPOSIXTool): mock_unlink.assert_called_with(newfile) @patch("%s.open" % builtins) - def test__get_diffs(self, mock_open): + @patch("Bcfg2.Utils") + def test__get_diffs(self, mock_utils, mock_open): orig_entry = lxml.etree.Element("Path", name="/test", type="file", mode='0644', owner='root', group='root') @@ -226,16 +211,15 @@ class TestPOSIXFile(TestPOSIXTool): ptool = self.get_obj() ptool._get_data = Mock() ptool._diff = Mock() - ptool._is_string = Mock() def reset(): - ptool._is_string.reset_mock() + mock_utils.is_string.reset_mock() ptool._get_data.reset_mock() ptool._diff.reset_mock() mock_open.reset_mock() return copy.deepcopy(orig_entry) - ptool._is_string.return_value = True + mock_utils.is_string.return_value = True ptool._get_data.return_value = (orig_entry.text, False) mock_open.return_value.read.return_value = ondisk ptool._diff.return_value = ["-test2", "+test"] @@ -250,7 +234,7 @@ class TestPOSIXFile(TestPOSIXTool): # binary data on disk entry = reset() - ptool._is_string.return_value = False + mock_utils.is_string.return_value = False ptool._get_diffs(entry, content=ondisk) self.assertFalse(mock_open.called) self.assertFalse(ptool._diff.called) @@ -258,7 +242,7 @@ class TestPOSIXFile(TestPOSIXTool): # sensitive, non-interactive -- do nothing entry = reset() - ptool._is_string.return_value = True + mock_utils.is_string.return_value = True ptool._get_diffs(entry, sensitive=True, interactive=False) self.assertFalse(mock_open.called) self.assertFalse(ptool._diff.called) diff --git a/testsuite/Testsrc/Testlib/TestUtils.py b/testsuite/Testsrc/Testlib/TestUtils.py index 4bed67248..a37f2ecbe 100644 --- a/testsuite/Testsrc/Testlib/TestUtils.py +++ b/testsuite/Testsrc/Testlib/TestUtils.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import os import sys from Bcfg2.Utils import * @@ -42,3 +43,17 @@ class TestPackedDigitRange(Bcfg2TestCase): for test in exc: self.assertNotIn(test, rng) self.assertFalse(rng.includes(test)) + + +class TestIsString(Bcfg2TestCase): + def test_is_string(self): + for char in list(range(8)) + list(range(14, 32)): + self.assertFalse(is_string("foo" + chr(char) + "bar", 'UTF-8')) + for char in list(range(9, 14)) + list(range(33, 128)): + self.assertTrue(is_string("foo" + chr(char) + "bar", 'UTF-8')) + + ustr = 'é' + self.assertTrue(is_string(ustr, 'UTF-8')) + if not inPy3k: + self.assertFalse(is_string("foo" + chr(128) + "bar", 'ascii')) + self.assertFalse(is_string(ustr, 'ascii')) -- cgit v1.2.3-1-g7c22