KB-SQL UDF for Compression
0 |
最近要對付一個每天狂長的痴肥資料庫,由於要保全交易相關證據供日後追查,資料表必須保存往來交換的XML原始檔案。XML格式向來以冗長囉嗦聞名,加上每日交易量愈來愈大,資料庫面臨空間不足及備份管理的壓日益沈重。於是我決定雙管齊下改善這個問題: 第一,將過期資料定期搬到另一個資料庫,就可以採行較寬鬆的備份策略,不用每天反覆備份鉅量歷史資料,時間與空間都較省。第二,我打算祭出壓縮大法,逼XML資料瘦身。
以往在.NET 1.1時代,不難找到Open Source的壓縮Library,例如: SharpZipLib。到了.NET 2.0就更方便了,BCL中內建了System.IO.Compression Namespace,雖然功能陽春,但不需要加掛其他程式庫這點非常迷人。尤其是我想把它整合成SQL的函數...
是的,我打算善用SQL 2005可以用.NET寫User Defined Function的特點,寫出壓縮/解壓函數,以便做出以下的效果:
在以上的範例中,我用OPENROWSET直接讀入一個23M的Log檔,用GZip壓縮成1.2M大小,解壓縮再還原回原始的大小,一切動作都在T-SQL中搞定,如此就算前端是ASP/JSP/PHP,也不必煩惱找不到GZip Library。
GZipCompress主要用來處理IMAGE, VARBINARY等資料型別;如果是字串(VARCHAR, NVARCHAR),可改用GZipCompressString,它會以UTF8將字串解讀成Byte Array,壓縮後再傳回Base64編碼的字串。不過變成Base64編碼的字串,如果存入NVARCHAR, NTEXT這類Unicode欄位時,每個字元都要佔掉兩位Byte,看著好不容易壓縮省下來的空間瞬間又吐了出去,讓人有點心痛。Base64為了確保7-bit的可讀性,只用了64個英數字與符號,不知道有沒有人善用Unicode具備數萬個可讀字元的特色,發明新的二進位資料編碼方式,那就更美妙了~~
GZip壓縮的SQL UDF範例如下:
1: using System;
2: using System.Data;
3: using System.Data.SqlClient;
4: using System.Data.SqlTypes;
5: using Microsoft.SqlServer.Server;
6: using System.IO;
7: using System.IO.Compression;
8: using System.Text;
9:
10: public partial class UserDefinedFunctions
11: {
12: /// <summary>
13: /// 將傳入的二進位資料以GZip演算法壓縮
14: /// </summary>
15: /// <param name="rawData">待壓縮的二進位資料</param>
16: /// <returns>經GZip壓縮後的二進位資料</returns>
17: [Microsoft.SqlServer.Server.SqlFunction]
18: public static SqlBytes GZipCompress(SqlBytes rawData)
19: {
20: if (rawData.IsNull)
21: return SqlBytes.Null;
22: else
23: {
24: byte[] buff = new byte[rawData.Length];
25: rawData.Read(0, buff, 0, (int)rawData.Length);
26: return new SqlBytes(Compress(buff));
27: }
28: }
29:
30: /// <summary>
31: /// 將傳入的二進位資料以GZip演算法解壓縮
32: /// </summary>
33: /// <param name="zippedData">經GZip壓縮後的二進位資料</param>
34: /// <returns>還原後的未壓縮原始資料</returns>
35: [Microsoft.SqlServer.Server.SqlFunction]
36: public static SqlBytes GZipDecompress(SqlBytes zippedData)
37: {
38: if (zippedData.IsNull)
39: return SqlBytes.Null;
40: else
41: {
42: byte[] buff = new byte[zippedData.Length];
43: zippedData.Read(0, buff, 0, (int)zippedData.Length);
44: return new SqlBytes(Decompress(buff));
45: }
46: }
47:
48: /// <summary>
49: /// 將傳入的字串以GZip演算法壓縮後,傳回Base64編碼字串
50: /// </summary>
51: /// <param name="rawString">要壓縮的字串</param>
52: /// <returns>壓縮後的字串(Base64)</returns>
53: [Microsoft.SqlServer.Server.SqlFunction]
54: public static SqlString GZipCompressString(SqlString rawString)
55: {
56: if (rawString.IsNull)
57: return SqlString.Null;
58: else
59: {
60: byte[] rawData = Encoding.UTF8.GetBytes(rawString.ToString());
61: byte[] zippedData = Compress(rawData);
62: return new SqlString(Convert.ToBase64String(zippedData));
63: }
64: }
65:
66: /// <summary>
67: /// 將傳入的Base64字串以GZip演算法解壓後,傳回未壓縮前的原始字串
68: /// </summary>
69: /// <param name="zippedString">壓縮後的字串(Base64)</param>
70: /// <returns>解壓還原後的字串</returns>
71: [Microsoft.SqlServer.Server.SqlFunction]
72: public static SqlString GZipDecompressString(SqlString zippedString)
73: {
74: if (zippedString.IsNull)
75: return SqlString.Null;
76: else
77: {
78: byte[] zippedData =
79: Convert.FromBase64String(zippedString.ToString());
80: return new SqlString(
81: Encoding.UTF8.GetString(Decompress(zippedData))
82: );
83: }
84: }
85:
86: //GZip壓縮
87: static byte[] Compress(byte[] rawData)
88: {
89: MemoryStream ms = new MemoryStream();
90: GZipStream compressedzipStream =
91: new GZipStream(ms, CompressionMode.Compress, true);
92: compressedzipStream.Write(rawData, 0, rawData.Length);
93: compressedzipStream.Close();
94: return ms.ToArray();
95:
96: }
97:
98: //GZip解壓縮
99: static byte[] Decompress(byte[] zippedData)
100: {
101: MemoryStream ms = new MemoryStream(zippedData);
102: GZipStream compressedzipStream =
103: new GZipStream(ms, CompressionMode.Decompress);
104: MemoryStream outBuffer = new MemoryStream();
105: byte[] block = new byte[1024];
106: while (true)
107: {
108: int bytesRead = compressedzipStream.Read(block, 0, block.Length);
109: if (bytesRead <= 0)
110: break;
111: else
112: outBuffer.Write(block, 0, bytesRead);
113: }
114: compressedzipStream.Close();
115: return outBuffer.ToArray();
116: }
117:
118: };
119:
PS: SQLCLR有記憶體位址使用的限制(預設最小為256M),並不會因為實體記憶體充足就無限加大,在壓縮/解壓大量資料時,可能因Memory Pressure而出現SQLCLR記憶體不夠的Exception。
Comments
Be the first to post a comment