最近要對付一個每天狂長的痴肥資料庫,由於要保全交易相關證據供日後追查,資料表必須保存往來交換的XML原始檔案。XML格式向來以冗長囉嗦聞名,加上每日交易量愈來愈大,資料庫面臨空間不足及備份管理的壓日益沈重。於是我決定雙管齊下改善這個問題: 第一,將過期資料定期搬到另一個資料庫,就可以採行較寬鬆的備份策略,不用每天反覆備份鉅量歷史資料,時間與空間都較省。第二,我打算祭出壓縮大法,逼XML資料瘦身。

以往在.NET 1.1時代,不難找到Open Source的壓縮Library,例如: SharpZipLib。到了.NET 2.0就更方便了,BCL中內建了System.IO.Compression Namespace,雖然功能陽春,但不需要加掛其他程式庫這點非常迷人。尤其是我想把它整合成SQL的函數...

是的,我打算善用SQL 2005可以用.NET寫User Defined Function的特點,寫出壓縮/解壓函數,以便做出以下的效果:

在以上的範例中,我用OPENROWSET直接讀入一個23M的Log檔,用GZip壓縮成1.2M大小,解壓縮再還原回原始的大小,一切動作都在T-SQL中搞定,如此就算前端是ASP/JSP/PHP,也不必煩惱找不到GZip Library。

GZipCompress主要用來處理IMAGE, VARBINARY等資料型別;如果是字串(VARCHAR, NVARCHAR),可改用GZipCompressString,它會以UTF8將字串解讀成Byte Array,壓縮後再傳回Base64編碼的字串。不過變成Base64編碼的字串,如果存入NVARCHAR, NTEXT這類Unicode欄位時,每個字元都要佔掉兩位Byte,看著好不容易壓縮省下來的空間瞬間又吐了出去,讓人有點心痛。Base64為了確保7-bit的可讀性,只用了64個英數字與符號,不知道有沒有人善用Unicode具備數萬個可讀字元的特色,發明新的二進位資料編碼方式,那就更美妙了~~

GZip壓縮的SQL UDF範例如下:

   1:  using System;
   2:  using System.Data;
   3:  using System.Data.SqlClient;
   4:  using System.Data.SqlTypes;
   5:  using Microsoft.SqlServer.Server;
   6:  using System.IO;
   7:  using System.IO.Compression;
   8:  using System.Text;
   9:   
  10:  public partial class UserDefinedFunctions
  11:  {
  12:  /// <summary>
  13:  /// 將傳入的二進位資料以GZip演算法壓縮
  14:  /// </summary>
  15:  /// <param name="rawData">待壓縮的二進位資料</param>
  16:  /// <returns>經GZip壓縮後的二進位資料</returns>
  17:  [Microsoft.SqlServer.Server.SqlFunction]
  18:  public static SqlBytes GZipCompress(SqlBytes rawData)
  19:  {
  20:      if (rawData.IsNull)
  21:          return SqlBytes.Null;
  22:      else
  23:      {
  24:          byte[] buff = new byte[rawData.Length];
  25:          rawData.Read(0, buff, 0, (int)rawData.Length);
  26:          return new SqlBytes(Compress(buff));
  27:      }
  28:  }
  29:   
  30:  /// <summary>
  31:  /// 將傳入的二進位資料以GZip演算法解壓縮
  32:  /// </summary>
  33:  /// <param name="zippedData">經GZip壓縮後的二進位資料</param>
  34:  /// <returns>還原後的未壓縮原始資料</returns>
  35:  [Microsoft.SqlServer.Server.SqlFunction]
  36:  public static SqlBytes GZipDecompress(SqlBytes zippedData)
  37:  {
  38:      if (zippedData.IsNull)
  39:          return SqlBytes.Null;
  40:      else
  41:      {
  42:          byte[] buff = new byte[zippedData.Length];
  43:          zippedData.Read(0, buff, 0, (int)zippedData.Length);
  44:          return new SqlBytes(Decompress(buff));
  45:      }
  46:  }
  47:   
  48:  /// <summary>
  49:  /// 將傳入的字串以GZip演算法壓縮後,傳回Base64編碼字串
  50:  /// </summary>
  51:  /// <param name="rawString">要壓縮的字串</param>
  52:  /// <returns>壓縮後的字串(Base64)</returns>
  53:  [Microsoft.SqlServer.Server.SqlFunction]
  54:  public static SqlString GZipCompressString(SqlString rawString)
  55:  {
  56:      if (rawString.IsNull)
  57:          return SqlString.Null;
  58:      else
  59:      {
  60:          byte[] rawData = Encoding.UTF8.GetBytes(rawString.ToString());
  61:          byte[] zippedData = Compress(rawData);
  62:          return new SqlString(Convert.ToBase64String(zippedData));
  63:      }
  64:  }
  65:   
  66:  /// <summary>
  67:  /// 將傳入的Base64字串以GZip演算法解壓後,傳回未壓縮前的原始字串
  68:  /// </summary>
  69:  /// <param name="zippedString">壓縮後的字串(Base64)</param>
  70:  /// <returns>解壓還原後的字串</returns>
  71:  [Microsoft.SqlServer.Server.SqlFunction]
  72:  public static SqlString GZipDecompressString(SqlString zippedString)
  73:  {
  74:      if (zippedString.IsNull)
  75:          return SqlString.Null;
  76:      else
  77:      {
  78:          byte[] zippedData = 
  79:              Convert.FromBase64String(zippedString.ToString());
  80:          return new SqlString(
  81:                  Encoding.UTF8.GetString(Decompress(zippedData))
  82:              );
  83:      }
  84:  }
  85:   
  86:  //GZip壓縮
  87:  static byte[] Compress(byte[] rawData)
  88:  {
  89:      MemoryStream ms = new MemoryStream();
  90:      GZipStream compressedzipStream = 
  91:          new GZipStream(ms, CompressionMode.Compress, true);
  92:      compressedzipStream.Write(rawData, 0, rawData.Length);
  93:      compressedzipStream.Close();
  94:      return ms.ToArray();
  95:   
  96:  }
  97:   
  98:  //GZip解壓縮
  99:  static byte[] Decompress(byte[] zippedData)
 100:  {
 101:      MemoryStream ms = new MemoryStream(zippedData);
 102:      GZipStream compressedzipStream = 
 103:          new GZipStream(ms, CompressionMode.Decompress);
 104:      MemoryStream outBuffer = new MemoryStream();
 105:      byte[] block = new byte[1024];
 106:      while (true)
 107:      {
 108:          int bytesRead = compressedzipStream.Read(block, 0, block.Length);
 109:          if (bytesRead <= 0)
 110:              break;
 111:          else
 112:              outBuffer.Write(block, 0, bytesRead);
 113:      }
 114:      compressedzipStream.Close();
 115:      return outBuffer.ToArray();
 116:  }
 117:   
 118:  };
 119:   

 PS: SQLCLR有記憶體位址使用的限制(預設最小為256M),並不會因為實體記憶體充足就無限加大,在壓縮/解壓大量資料時,可能因Memory Pressure而出現SQLCLR記憶體不夠的Exception。


Comments

Be the first to post a comment

Post a comment


24 + 23 =